Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -785,18 +785,24 @@
       // of casting the original vector twice. We also need to factor in the
       // cost of the split itself. Count that as 1, to be consistent with
       // TLI->getTypeLegalizationCost().
-      if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
-               TargetLowering::TypeSplitVector ||
-           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
-               TargetLowering::TypeSplitVector) &&
-          SrcVTy->getNumElements() > 1 && DstVTy->getNumElements() > 1) {
-        Type *SplitDst = VectorType::get(DstVTy->getElementType(),
-                                         DstVTy->getNumElements() / 2);
-        Type *SplitSrc = VectorType::get(SrcVTy->getElementType(),
-                                         SrcVTy->getNumElements() / 2);
+      bool SplitSrc =
+          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
+          TargetLowering::TypeSplitVector;
+      bool SplitDst =
+          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
+          TargetLowering::TypeSplitVector;
+      if ((SplitSrc || SplitDst) && SrcVTy->getNumElements() > 1 &&
+          DstVTy->getNumElements() > 1) {
+        Type *SplitDstTy = VectorType::get(DstVTy->getElementType(),
+                                           DstVTy->getNumElements() / 2);
+        Type *SplitSrcTy = VectorType::get(SrcVTy->getElementType(),
+                                           SrcVTy->getNumElements() / 2);
         T *TTI = static_cast<T *>(this);
-        return TTI->getVectorSplitCost() +
-               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
+        // If both types need to be split then the split is free.
+        unsigned SplitCost =
+            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
+        return SplitCost +
+               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, I));
       }
 
       // In other cases where the source or destination are illegal, assume
Index: llvm/test/Analysis/CostModel/ARM/cast.ll
===================================================================
--- llvm/test/Analysis/CostModel/ARM/cast.ll
+++ llvm/test/Analysis/CostModel/ARM/cast.ll
@@ -149,8 +149,8 @@
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
@@ -159,18 +159,18 @@
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
@@ -179,18 +179,18 @@
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 263 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 263 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 263 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 263 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
@@ -229,8 +229,8 @@
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
@@ -239,8 +239,8 @@
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 115 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 115 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
@@ -249,8 +249,8 @@
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
@@ -259,18 +259,18 @@
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 199 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 199 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
 ; CHECK-NEON-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; CHECK-MVE-LABEL: 'casts'
@@ -371,13 +371,13 @@
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 656 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
@@ -416,8 +416,8 @@
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
@@ -426,38 +426,38 @@
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 328 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1312 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 661 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 656 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 656 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
@@ -496,8 +496,8 @@
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
@@ -506,8 +506,8 @@
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
@@ -516,28 +516,28 @@
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 170 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1045 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1045 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1045 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1045 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1045 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1045 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1045 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1045 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1045 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1045 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1044 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1044 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1044 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1044 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
 ; CHECK-MVE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-MAIN-LABEL: 'casts'
@@ -621,30 +621,30 @@
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
@@ -653,16 +653,16 @@
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
@@ -673,16 +673,16 @@
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
@@ -693,16 +693,16 @@
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
@@ -713,16 +713,16 @@
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
@@ -733,16 +733,16 @@
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
@@ -753,16 +753,16 @@
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
@@ -773,16 +773,16 @@
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
@@ -793,16 +793,16 @@
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -888,30 +888,30 @@
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s72 = zext <4 x i8> undef to <4 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r72 = zext <8 x i8> undef to <8 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r73 = zext <16 x i8> undef to <16 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_0 = sext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_1 = zext <8 x i8> undef to <8 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %rext_2 = sext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %rext_3 = zext <8 x i16> undef to <8 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %rext_4 = sext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %rext_5 = zext <4 x i16> undef to <4 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_6 = sext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_7 = zext <2 x i8> undef to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %rext_8 = sext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_9 = zext <2 x i16> undef to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_a = sext <2 x i32> undef to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %rext_b = zext <2 x i32> undef to <2 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r74 = trunc <8 x i32> undef to <8 x i8>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r75 = trunc <16 x i32> undef to <16 x i8>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
@@ -920,16 +920,16 @@
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
@@ -940,16 +940,16 @@
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
@@ -960,16 +960,16 @@
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
@@ -980,16 +980,16 @@
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
@@ -1000,16 +1000,16 @@
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
@@ -1020,16 +1020,16 @@
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
@@ -1040,16 +1040,16 @@
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
@@ -1060,16 +1060,16 @@
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -1217,8 +1217,8 @@
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
@@ -1227,18 +1227,18 @@
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
@@ -1247,18 +1247,18 @@
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 263 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 263 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 263 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 263 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
@@ -1297,8 +1297,8 @@
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
@@ -1307,8 +1307,8 @@
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 115 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 115 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
@@ -1317,8 +1317,8 @@
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
@@ -1327,18 +1327,18 @@
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 228 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 199 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 199 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
 ; CHECK-V8R-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   ; -- scalars --
@@ -1742,14 +1742,14 @@
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
 ; CHECK-V8M-MAIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-BASE-LABEL: 'load_extends'
@@ -1778,14 +1778,14 @@
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
 ; CHECK-V8M-BASE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; CHECK-V8R-LABEL: 'load_extends'
Index: llvm/test/Analysis/CostModel/X86/arith-fix.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/arith-fix.ll
+++ llvm/test/Analysis/CostModel/X86/arith-fix.ll
@@ -35,77 +35,77 @@
 ; SSSE3-LABEL: 'smul'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'smul'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'smul'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'smul'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'smul'
@@ -168,58 +168,58 @@
 ; SLM-LABEL: 'smul'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'smul'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'smul'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3)
@@ -269,77 +269,77 @@
 ; SSSE3-LABEL: 'umul'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'umul'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'umul'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'umul'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'umul'
@@ -402,58 +402,58 @@
 ; SLM-LABEL: 'umul'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'umul'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'umul'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3)
Index: llvm/test/Analysis/CostModel/X86/arith-overflow.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -991,77 +991,77 @@
 ; SSSE3-LABEL: 'smul'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'smul'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'smul'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'smul'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'smul'
@@ -1124,58 +1124,58 @@
 ; SLM-LABEL: 'smul'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'smul'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'smul'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
@@ -1229,77 +1229,77 @@
 ; SSSE3-LABEL: 'umul'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'umul'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'umul'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'umul'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'umul'
@@ -1362,58 +1362,58 @@
 ; SLM-LABEL: 'umul'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'umul'
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; GLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; GLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'umul'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
Index: llvm/test/Analysis/CostModel/X86/cast.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/cast.ll
+++ llvm/test/Analysis/CostModel/X86/cast.ll
@@ -85,8 +85,8 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %D = zext <4 x i32> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %D3 = zext <16 x i16> undef to <16 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %D4 = zext <16 x i8> undef to <16 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %D5 = zext <16 x i1> undef to <16 x i32>
@@ -95,7 +95,7 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -115,8 +115,8 @@
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D = zext <4 x i32> undef to <4 x i64>
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D3 = zext <16 x i16> undef to <16 x i32>
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D4 = zext <16 x i8> undef to <16 x i32>
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %D5 = zext <16 x i1> undef to <16 x i32>
@@ -125,7 +125,7 @@
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -529,9 +529,9 @@
 define void @fp_conv(<8 x float> %a, <16 x float>%b, <4 x float> %c) {
 ; SSE-LABEL: 'fp_conv'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A1 = fpext <4 x float> %c to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A2 = fpext <8 x float> %a to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %A2 = fpext <8 x float> %a to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'fp_conv'
Index: llvm/test/Analysis/CostModel/X86/extend.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/extend.ll
+++ llvm/test/Analysis/CostModel/X86/extend.ll
@@ -16,21 +16,21 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %I64 = zext i32 undef to i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'zext_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %I64 = zext i32 undef to i64
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'zext_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %I64 = zext i32 undef to i64
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i32> undef to <2 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = zext <4 x i32> undef to <4 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = zext <8 x i32> undef to <8 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'zext_vXi32'
@@ -184,7 +184,7 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'zext_vXi8'
@@ -202,7 +202,7 @@
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'zext_vXi8'
@@ -220,7 +220,7 @@
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i8> undef to <8 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = zext <16 x i8> undef to <16 x i16>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i16 = zext <32 x i8> undef to <32 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'zext_vXi8'
@@ -350,7 +350,7 @@
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
@@ -519,21 +519,21 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i32 undef to i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sext_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i32 undef to i64
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sext_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i32 undef to i64
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i32> undef to <2 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = sext <4 x i32> undef to <4 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = sext <8 x i32> undef to <8 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sext_vXi32'
@@ -687,7 +687,7 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sext_vXi8'
@@ -705,7 +705,7 @@
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sext_vXi8'
@@ -723,7 +723,7 @@
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = sext <8 x i8> undef to <8 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = sext <16 x i8> undef to <16 x i16>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i16 = sext <32 x i8> undef to <32 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sext_vXi8'
@@ -853,7 +853,7 @@
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
Index: llvm/test/Analysis/CostModel/X86/fptosi.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/fptosi.ll
+++ llvm/test/Analysis/CostModel/X86/fptosi.ll
@@ -14,22 +14,22 @@
 ; SSE2-LABEL: 'fptosi_double_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptosi_double_i64'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'fptosi_double_i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptosi_double_i64'
@@ -49,8 +49,8 @@
 ; SLM-LABEL: 'fptosi_double_i64'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = fptosi double undef to i64
@@ -65,7 +65,7 @@
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'fptosi_double_i32'
@@ -166,16 +166,16 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptosi_float_i64'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'fptosi_float_i64'
@@ -183,7 +183,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptosi_float_i64'
@@ -206,8 +206,8 @@
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = fptosi float undef to i64
@@ -257,7 +257,7 @@
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'fptosi_float_i16'
Index: llvm/test/Analysis/CostModel/X86/fptoui.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/fptoui.ll
+++ llvm/test/Analysis/CostModel/X86/fptoui.ll
@@ -14,22 +14,22 @@
 ; SSE2-LABEL: 'fptoui_double_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_double_i64'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'fptoui_double_i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptoui_double_i64'
@@ -49,8 +49,8 @@
 ; SLM-LABEL: 'fptoui_double_i64'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = fptoui double undef to i64
@@ -65,14 +65,14 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_double_i32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'fptoui_double_i32'
@@ -93,7 +93,7 @@
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I32 = fptoui double undef to i32
@@ -180,16 +180,16 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 119 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 116 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_float_i64'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'fptoui_float_i64'
@@ -197,7 +197,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptoui_float_i64'
@@ -220,8 +220,8 @@
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = fptoui float undef to i64
@@ -237,16 +237,16 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_float_i32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'fptoui_float_i32'
@@ -254,7 +254,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'fptoui_float_i32'
@@ -269,8 +269,8 @@
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I32 = fptoui float undef to i32
@@ -287,7 +287,7 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_float_i16'
@@ -295,7 +295,7 @@
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'fptoui_float_i16'
@@ -319,7 +319,7 @@
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I16 = fptoui float undef to i16
Index: llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -1337,31 +1337,31 @@
 
 define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_16f32_const_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_const_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX1-LABEL: 'test_gather_16f32_const_mask'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_const_mask'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_const_mask'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
@@ -1381,31 +1381,31 @@
 
 define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_16f32_var_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_var_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX1-LABEL: 'test_gather_16f32_var_mask'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_var_mask'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_var_mask'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
@@ -1425,31 +1425,31 @@
 
 define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
 ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
@@ -1471,7 +1471,7 @@
 ; SSE2-LABEL: 'test_gather_16f32_const_mask2'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
@@ -1479,7 +1479,7 @@
 ; SSE42-LABEL: 'test_gather_16f32_const_mask2'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
@@ -1487,7 +1487,7 @@
 ; AVX1-LABEL: 'test_gather_16f32_const_mask2'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
@@ -1495,7 +1495,7 @@
 ; AVX2-LABEL: 'test_gather_16f32_const_mask2'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
@@ -1503,7 +1503,7 @@
 ; SKL-LABEL: 'test_gather_16f32_const_mask2'
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
-; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
Index: llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
+++ llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
@@ -268,13 +268,13 @@
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i8 = zext <8 x i1> undef to <8 x i8>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i8 = zext <16 x i1> undef to <16 x i8>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32i8 = zext <32 x i1> undef to <32 x i8>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V64i8 = zext <64 x i1> undef to <64 x i8>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512VL512-LABEL: 'zext256_vXi1'
@@ -406,14 +406,14 @@
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i8 = sext <8 x i1> undef to <8 x i8>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i8 = sext <16 x i1> undef to <16 x i8>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32i8 = sext <32 x i1> undef to <32 x i8>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V64i8 = sext <64 x i1> undef to <64 x i8>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512VL512-LABEL: 'sext256_vXi1'
@@ -527,19 +527,19 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
@@ -553,26 +553,26 @@
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512VL512-LABEL: 'trunc_vXi1'
@@ -580,20 +580,20 @@
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
Index: llvm/test/Analysis/CostModel/X86/sitofp.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/sitofp.ll
+++ llvm/test/Analysis/CostModel/X86/sitofp.ll
@@ -109,7 +109,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sitofp_i64_double'
@@ -213,7 +213,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'sitofp_i32_float'
@@ -246,7 +246,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sitofp_i64_float'
Index: llvm/test/Analysis/CostModel/X86/trunc.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/trunc.ll
+++ llvm/test/Analysis/CostModel/X86/trunc.ll
@@ -15,15 +15,15 @@
 ; SSE-LABEL: 'trunc_vXi32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'trunc_vXi32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i32>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i32>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i32>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'trunc_vXi32'
@@ -45,39 +45,39 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'trunc_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'trunc_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'trunc_vXi16'
@@ -85,12 +85,12 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'trunc_vXi16'
@@ -98,12 +98,12 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'trunc_vXi16'
@@ -124,12 +124,12 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2i64 = trunc <2 x i64> undef to <2 x i16>
@@ -152,20 +152,20 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'trunc_vXi8'
@@ -173,20 +173,20 @@
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'trunc_vXi8'
@@ -194,20 +194,20 @@
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'trunc_vXi8'
@@ -216,19 +216,19 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'trunc_vXi8'
@@ -237,19 +237,19 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'trunc_vXi8'
@@ -300,19 +300,19 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i8>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2i64 = trunc <2 x i64> undef to <2 x i8>
@@ -345,20 +345,20 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
@@ -372,20 +372,20 @@
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
@@ -399,20 +399,20 @@
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
@@ -427,19 +427,19 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
@@ -453,20 +453,20 @@
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
Index: llvm/test/Analysis/CostModel/X86/uitofp.ll
===================================================================
--- llvm/test/Analysis/CostModel/X86/uitofp.ll
+++ llvm/test/Analysis/CostModel/X86/uitofp.ll
@@ -109,7 +109,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'uitofp_i64_double'
@@ -213,7 +213,7 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'uitofp_i32_float'
@@ -221,7 +221,7 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'uitofp_i32_float'
@@ -237,7 +237,7 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %cvt_i32_f32 = uitofp i32 undef to float
@@ -262,7 +262,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'uitofp_i64_float'