diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2393,10 +2393,11 @@
    };
 
   assert(Val->isVectorTy() && "This must be a vector type");
-
   Type *ScalarType = Val->getScalarType();
+  int RegisterFileMoveCost = 0;
 
-  if (Index != -1U) {
+  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
+                       Opcode == Instruction::InsertElement)) {
     // Legalize the type.
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
 
@@ -2405,17 +2406,32 @@
       return 0;
 
     // The type may be split. Normalize the index to the new type.
-    unsigned Width = LT.second.getVectorNumElements();
-    Index = Index % Width;
+    unsigned NumElts = LT.second.getVectorNumElements();
+    unsigned SubNumElts = NumElts;
+    Index = Index % NumElts;
+
+    // For >128-bit vectors, we need to extract higher 128-bit subvectors.
+    // For inserts, we also need to insert the subvector back.
+    if (LT.second.getSizeInBits() > 128) {
+      assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
+      unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+      SubNumElts = NumElts / NumSubVecs;
+      if (SubNumElts <= Index) {
+        RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
+        Index %= SubNumElts;
+      }
+    }
 
     if (Index == 0) {
       // Floating point scalars are already located in index #0.
+      // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
+      // true for all.
       if (ScalarType->isFloatingPointTy())
-        return 0;
+        return RegisterFileMoveCost;
 
-      // Assume movd/movq XMM <-> GPR is relatively cheap on all targets.
-      if (ScalarType->isIntegerTy())
-        return 1;
+      // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
+      if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
+        return 1 + RegisterFileMoveCost;
     }
 
     int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -2423,14 +2439,36 @@
     MVT MScalarTy = LT.second.getScalarType();
     if (ST->isSLM())
       if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
-        return Entry->Cost;
+        return Entry->Cost + RegisterFileMoveCost;
+
+    // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
+    if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
+        (MScalarTy.isInteger() && ST->hasSSE41()))
+      return 1 + RegisterFileMoveCost;
+
+    // Assume insertps is relatively cheap on all targets.
+    if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
+        Opcode == Instruction::InsertElement)
+      return 1 + RegisterFileMoveCost;
+
+    // For extractions we just need to shuffle the element to index 0, which
+    // should be very cheap (assume cost = 1). For insertions we need to shuffle
+    // the elements to its destination. In both cases we must handle the
+    // subvector move(s).
+    // TODO: Under what circumstances should we shuffle using the full width?
+    int ShuffleCost = 1;
+    if (Opcode == Instruction::InsertElement) {
+      Type *SubTy = VectorType::get(Val->getVectorElementType(), SubNumElts);
+      ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
+    }
+    int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
+    return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
   }
 
   // Add to the base cost if we know that the extracted element of a vector is
   // destined to be moved to and used in the integer register file.
-  int RegisterFileMoveCost = 0;
   if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
-    RegisterFileMoveCost = 1;
+    RegisterFileMoveCost += 1;
 
   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp.ll b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
--- a/llvm/test/Analysis/CostModel/X86/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
@@ -673,9 +673,9 @@
 define i32 @frem(i32 %arg) {
 ; SSE1-LABEL: 'frem'
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = frem <4 x double> undef, undef
@@ -684,9 +684,9 @@
 ;
 ; SSE2-LABEL: 'frem'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
@@ -707,23 +707,23 @@
 ; AVX-LABEL: 'frem'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = frem <8 x float> undef, undef
-; AVX-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = frem <16 x float> undef, undef
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef
-; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = frem <8 x double> undef, undef
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef
+; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'frem'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = frem <8 x float> undef, undef
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16F32 = frem <16 x float> undef, undef
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = frem <8 x double> undef, undef
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'frem'
@@ -1059,9 +1059,9 @@
 define i32 @fma(i32 %arg) {
 ; SSE1-LABEL: 'fma'
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; SSE1-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SSE1-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
 ; SSE1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
@@ -1070,9 +1070,9 @@
 ;
 ; SSE2-LABEL: 'fma'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
@@ -1093,12 +1093,12 @@
 ; AVX-LABEL: 'fma'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 174 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'fma'
diff --git a/llvm/test/Analysis/CostModel/X86/extend.ll b/llvm/test/Analysis/CostModel/X86/extend.ll
--- a/llvm/test/Analysis/CostModel/X86/extend.ll
+++ b/llvm/test/Analysis/CostModel/X86/extend.ll
@@ -374,8 +374,8 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
@@ -399,8 +399,8 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
@@ -414,7 +414,7 @@
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = zext i1 undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = zext i1 undef to i32
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32>
@@ -424,8 +424,8 @@
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
@@ -439,7 +439,7 @@
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = zext i1 undef to i64
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = zext <2 x i1> undef to <2 x i64>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i1> undef to <4 x i64>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = zext <8 x i1> undef to <8 x i64>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = zext i1 undef to i32
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = zext <2 x i1> undef to <2 x i32>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = zext <4 x i1> undef to <4 x i32>
@@ -474,8 +474,8 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i1> undef to <2 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i1> undef to <4 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = zext <8 x i1> undef to <8 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = zext <16 x i1> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = zext <32 x i1> undef to <32 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = zext i1 undef to i8
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = zext <2 x i1> undef to <2 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = zext <4 x i1> undef to <4 x i8>
@@ -877,8 +877,8 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
@@ -902,8 +902,8 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
@@ -917,7 +917,7 @@
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
@@ -927,8 +927,8 @@
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
@@ -942,7 +942,7 @@
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
@@ -977,8 +977,8 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i1> undef to <2 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i1> undef to <4 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = sext <8 x i1> undef to <8 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16i16 = sext <16 x i1> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V32i16 = sext <32 x i1> undef to <32 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sext i1 undef to i8
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i8 = sext <2 x i1> undef to <2 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = sext <4 x i1> undef to <4 x i8>
diff --git a/llvm/test/Analysis/CostModel/X86/fptosi.ll b/llvm/test/Analysis/CostModel/X86/fptosi.ll
--- a/llvm/test/Analysis/CostModel/X86/fptosi.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptosi.ll
@@ -13,9 +13,9 @@
 define i32 @fptosi_double_i64(i32 %arg) {
 ; SSE2-LABEL: 'fptosi_double_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptosi_double_i64'
@@ -28,15 +28,15 @@
 ; AVX-LABEL: 'fptosi_double_i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptosi_double_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'fptosi_double_i64'
@@ -121,9 +121,9 @@
 define i32 @fptosi_double_i8(i32 %arg) {
 ; SSE2-LABEL: 'fptosi_double_i8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptosi_double_i8'
@@ -164,10 +164,10 @@
 define i32 @fptosi_float_i64(i32 %arg) {
 ; SSE2-LABEL: 'fptosi_float_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptosi_float_i64'
@@ -181,17 +181,17 @@
 ; AVX-LABEL: 'fptosi_float_i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptosi_float_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'fptosi_float_i64'
@@ -265,9 +265,9 @@
 define i32 @fptosi_float_i8(i32 %arg) {
 ; SSE2-LABEL: 'fptosi_float_i8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptosi_float_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll
--- a/llvm/test/Analysis/CostModel/X86/fptoui.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll
@@ -13,9 +13,9 @@
 define i32 @fptoui_double_i64(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_double_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_double_i64'
@@ -28,15 +28,15 @@
 ; AVX-LABEL: 'fptoui_double_i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptoui_double_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'fptoui_double_i64'
@@ -63,9 +63,9 @@
 define i32 @fptoui_double_i32(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_double_i32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_double_i32'
@@ -149,9 +149,9 @@
 define i32 @fptoui_double_i8(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_double_i8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_double_i8'
@@ -192,10 +192,10 @@
 define i32 @fptoui_float_i64(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_float_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_float_i64'
@@ -209,17 +209,17 @@
 ; AVX-LABEL: 'fptoui_float_i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 123 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'fptoui_float_i64'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'fptoui_float_i64'
@@ -249,9 +249,9 @@
 define i32 @fptoui_float_i32(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_float_i32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_float_i32'
@@ -335,9 +335,9 @@
 define i32 @fptoui_float_i8(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_float_i8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'fptoui_float_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/fround.ll b/llvm/test/Analysis/CostModel/X86/fround.ll
--- a/llvm/test/Analysis/CostModel/X86/fround.ll
+++ b/llvm/test/Analysis/CostModel/X86/fround.ll
@@ -16,9 +16,9 @@
 define i32 @ceil(i32 %arg) {
 ; SSE2-LABEL: 'ceil'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.ceil.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.ceil.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.ceil.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.ceil.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.ceil.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.ceil.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.ceil.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.ceil.f64(double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.ceil.v2f64(<2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.ceil.v4f64(<4 x double> undef)
@@ -96,9 +96,9 @@
 define i32 @floor(i32 %arg) {
 ; SSE2-LABEL: 'floor'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.floor.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.floor.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.floor.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.floor.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.floor.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.floor.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.floor.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.floor.f64(double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.floor.v2f64(<2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.floor.v4f64(<4 x double> undef)
@@ -176,9 +176,9 @@
 define i32 @nearbyint(i32 %arg) {
 ; SSE2-LABEL: 'nearbyint'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.nearbyint.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.nearbyint.f64(double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> undef)
@@ -256,9 +256,9 @@
 define i32 @rint(i32 %arg) {
 ; SSE2-LABEL: 'rint'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.rint.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.rint.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.rint.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.rint.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.rint.f64(double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.rint.v2f64(<2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.rint.v4f64(<4 x double> undef)
@@ -336,9 +336,9 @@
 define i32 @trunc(i32 %arg) {
 ; SSE2-LABEL: 'trunc'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.trunc.f32(float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.trunc.v4f32(<4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.trunc.v8f32(<8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.trunc.v16f32(<16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.trunc.v4f32(<4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.trunc.v8f32(<8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.trunc.v16f32(<16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.trunc.f64(double undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.trunc.v2f64(<2 x double> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.trunc.v4f64(<4 x double> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll
@@ -22,7 +22,7 @@
   ret void
 
 ; CORE2: Printing analysis 'Cost Model Analysis' for function 'test1':
-; CORE2: Cost Model: Found an estimated cost of 46 for instruction:   %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)
+; CORE2: Cost Model: Found an estimated cost of 49 for instruction:   %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)
 
 ; COREI7: Printing analysis 'Cost Model Analysis' for function 'test1':
 ; COREI7: Cost Model: Found an estimated cost of 1 for instruction:   %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)
@@ -50,7 +50,7 @@
   ret void
 
 ; CORE2: Printing analysis 'Cost Model Analysis' for function 'test2':
-; CORE2: Cost Model: Found an estimated cost of 46 for instruction:   %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)
+; CORE2: Cost Model: Found an estimated cost of 49 for instruction:   %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)
 
 ; COREI7: Printing analysis 'Cost Model Analysis' for function 'test2':
 ; COREI7: Cost Model: Found an estimated cost of 1 for instruction:   %2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)
diff --git a/llvm/test/Analysis/CostModel/X86/load_store.ll b/llvm/test/Analysis/CostModel/X86/load_store.ll
--- a/llvm/test/Analysis/CostModel/X86/load_store.ll
+++ b/llvm/test/Analysis/CostModel/X86/load_store.ll
@@ -72,10 +72,10 @@
   ;CHECK: cost of 3 {{.*}} load
   load <3 x i64>, <3 x i64>* undef, align 4
 
-  ;CHECK: cost of 10 {{.*}} load
+  ;CHECK: cost of 12 {{.*}} load
   load <5 x i32>, <5 x i32>* undef, align 4
 
-  ;CHECK: cost of 10 {{.*}} load
+  ;CHECK: cost of 14 {{.*}} load
   load <5 x i64>, <5 x i64>* undef, align 4
 
   ret i32 undef
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -9,32 +9,59 @@
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze | FileCheck %s --check-prefixes=CHECK,AVX512,SKX
 
 define i32 @masked_load() {
-; SSE-LABEL: 'masked_load'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1148 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 574 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 287 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_load'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
@@ -53,12 +80,12 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -80,12 +107,12 @@
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -151,32 +178,59 @@
 }
 
 define i32 @masked_store() {
-; SSE-LABEL: 'masked_store'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> undef, <4 x double>* undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> undef, <2 x double>* undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> undef, <1 x double>* undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> undef, <16 x float>* undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> undef, <8 x float>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> undef, <4 x float>* undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> undef, <2 x float>* undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i64.p0v8i64(<8 x i64> undef, <8 x i64>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> undef, <4 x i64>* undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> undef, <2 x i64>* undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0v1i64(<1 x i64> undef, <1 x i64>* undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_store'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef)
@@ -195,12 +249,12 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -222,12 +276,12 @@
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -293,83 +347,110 @@
 }
 
 define i32 @masked_gather() {
-; SSE-LABEL: 'masked_gather'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_gather'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_gather'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -391,19 +472,19 @@
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 1, <2 x i1> undef, <2 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 1, <1 x i1> undef, <1 x double> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef)
@@ -411,19 +492,19 @@
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 1, <4 x i1> undef, <4 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 1, <2 x i1> undef, <2 x float> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -445,12 +526,12 @@
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -489,63 +570,90 @@
 }
 
 define i32 @masked_scatter() {
-; SSE-LABEL: 'masked_scatter'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_scatter'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 1, <2 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 1, <1 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef)
@@ -553,19 +661,19 @@
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 1, <2 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 1, <2 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 1, <1 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -587,12 +695,12 @@
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
@@ -631,86 +739,140 @@
 }
 
 define i32 @masked_expandload() {
-; SSE-LABEL: 'masked_expandload'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_expandload'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; KNL-LABEL: 'masked_expandload'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_expandload'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
   %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
@@ -746,86 +908,140 @@
 }
 
 define i32 @masked_compressstore() {
-; SSE-LABEL: 'masked_compressstore'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 125 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 311 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 155 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_compressstore'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; KNL-LABEL: 'masked_compressstore'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 223 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_compressstore'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 119 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 239 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
   call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
@@ -863,7 +1079,7 @@
 define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
 ; SSE2-LABEL: 'test1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; SSE42-LABEL: 'test1'
@@ -887,10 +1103,15 @@
 }
 
 define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
-; SSE-LABEL: 'test2'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; AVX-LABEL: 'test2'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -908,10 +1129,15 @@
 }
 
 define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
-; SSE-LABEL: 'test3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test3'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -929,10 +1155,15 @@
 }
 
 define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
-; SSE-LABEL: 'test4'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %res
 ;
 ; AVX1-LABEL: 'test4'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
@@ -960,10 +1191,15 @@
 }
 
 define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
-; SSE-LABEL: 'test5'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test5'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -981,10 +1217,15 @@
 }
 
 define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
-; SSE-LABEL: 'test6'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test6'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -1002,10 +1243,15 @@
 }
 
 define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
-; SSE-LABEL: 'test7'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
 ;
 ; AVX-LABEL: 'test7'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -1023,10 +1269,15 @@
 }
 
 define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
-; SSE-LABEL: 'test8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
 ;
 ; AVX-LABEL: 'test8'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -1044,9 +1295,13 @@
 }
 
 define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0)  {
-; SSE-LABEL: 'test_gather_2f64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res
 ;
 ; AVX1-LABEL: 'test_gather_2f64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
@@ -1069,9 +1324,13 @@
 }
 
 define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0)  {
-; SSE-LABEL: 'test_gather_4i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; AVX1-LABEL: 'test_gather_4i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
@@ -1098,9 +1357,13 @@
 }
 
 define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0)  {
-; SSE-LABEL: 'test_gather_4i32_const_mask'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res
 ;
 ; AVX1-LABEL: 'test_gather_4i32_const_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
@@ -1130,7 +1393,7 @@
 ; SSE2-LABEL: 'test_gather_16f32_const_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_const_mask'
@@ -1142,13 +1405,13 @@
 ; AVX1-LABEL: 'test_gather_16f32_const_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_const_mask'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_const_mask'
@@ -1174,7 +1437,7 @@
 ; SSE2-LABEL: 'test_gather_16f32_var_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_var_mask'
@@ -1186,13 +1449,13 @@
 ; AVX1-LABEL: 'test_gather_16f32_var_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_var_mask'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_var_mask'
@@ -1218,7 +1481,7 @@
 ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
@@ -1230,13 +1493,13 @@
 ; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
@@ -1260,11 +1523,11 @@
 
 define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) {
 ; SSE2-LABEL: 'test_gather_16f32_const_mask2'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_16f32_const_mask2'
@@ -1280,7 +1543,7 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; AVX2-LABEL: 'test_gather_16f32_const_mask2'
@@ -1288,7 +1551,7 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res
 ;
 ; SKL-LABEL: 'test_gather_16f32_const_mask2'
@@ -1318,20 +1581,28 @@
 }
 
 define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
-; SSE-LABEL: 'test_scatter_16i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_scatter_16i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_scatter_16i32'
@@ -1339,7 +1610,7 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SKL-LABEL: 'test_scatter_16i32'
@@ -1347,7 +1618,7 @@
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
-; SKL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_scatter_16i32'
@@ -1368,12 +1639,16 @@
 }
 
 define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {
-; SSE-LABEL: 'test_scatter_8i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_scatter_8i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_scatter_8i32'
@@ -1385,9 +1660,13 @@
 }
 
 define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
-; SSE-LABEL: 'test_scatter_4i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_scatter_4i32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
@@ -1409,7 +1688,7 @@
 ; SSE2-LABEL: 'test_gather_4f32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32'
@@ -1459,7 +1738,7 @@
 ; SSE2-LABEL: 'test_gather_4f32_const_mask'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res
 ;
 ; SSE42-LABEL: 'test_gather_4f32_const_mask'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
--- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
@@ -38,7 +38,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Unknown cost for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -51,7 +51,7 @@
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -90,7 +90,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -102,7 +102,7 @@
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -358,7 +358,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -382,7 +382,7 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -417,7 +417,7 @@
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -441,7 +441,7 @@
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -476,7 +476,7 @@
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -500,7 +500,7 @@
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -709,9 +709,9 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -733,9 +733,9 @@
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
@@ -828,9 +828,9 @@
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -852,9 +852,9 @@
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
@@ -51,11 +51,11 @@
 ; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -109,11 +109,11 @@
 ; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/sitofp.ll b/llvm/test/Analysis/CostModel/X86/sitofp.ll
--- a/llvm/test/Analysis/CostModel/X86/sitofp.ll
+++ b/llvm/test/Analysis/CostModel/X86/sitofp.ll
@@ -116,7 +116,7 @@
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sitofp_i64_double'
@@ -241,8 +241,8 @@
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'sitofp_i64_float'
diff --git a/llvm/test/Analysis/CostModel/X86/vector-extract.ll b/llvm/test/Analysis/CostModel/X86/vector-extract.ll
--- a/llvm/test/Analysis/CostModel/X86/vector-extract.ll
+++ b/llvm/test/Analysis/CostModel/X86/vector-extract.ll
@@ -34,12 +34,12 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = extractelement <4 x double> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = extractelement <4 x double> undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = extractelement <8 x double> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = extractelement <8 x double> undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_4 = extractelement <8 x double> undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'extract_double'
@@ -48,12 +48,12 @@
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = extractelement <2 x double> undef, i32 1
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = extractelement <4 x double> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = extractelement <4 x double> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_3 = extractelement <4 x double> undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = extractelement <8 x double> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = extractelement <8 x double> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_3 = extractelement <8 x double> undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_4 = extractelement <8 x double> undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_7 = extractelement <8 x double> undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2f64_a = extractelement <2 x double> undef, i32 %arg
@@ -104,12 +104,12 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = extractelement <8 x float> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = extractelement <8 x float> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = extractelement <8 x float> undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = extractelement <16 x float> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = extractelement <16 x float> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = extractelement <16 x float> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = extractelement <16 x float> undef, i32 8
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'extract_float'
@@ -123,12 +123,12 @@
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = extractelement <8 x float> undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = extractelement <8 x float> undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = extractelement <8 x float> undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = extractelement <8 x float> undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = extractelement <16 x float> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = extractelement <16 x float> undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = extractelement <16 x float> undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = extractelement <16 x float> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = extractelement <16 x float> undef, i32 15
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2f32_a = extractelement <2 x float> undef, i32 %arg
@@ -158,43 +158,43 @@
 ; SSE2-LABEL: 'extract_i64'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'extract_i64'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'extract_i64'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'extract_i64'
@@ -217,12 +217,12 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'extract_i64'
@@ -231,12 +231,12 @@
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'extract_i64'
@@ -288,58 +288,58 @@
 ; SSE2-LABEL: 'extract_i32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'extract_i32'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'extract_i32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'extract_i32'
@@ -371,13 +371,13 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'extract_i32'
@@ -390,13 +390,13 @@
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'extract_i32'
@@ -544,36 +544,55 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX512-LABEL: 'extract_i16'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-LABEL: 'extract_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'extract_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'extract_i16'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg
@@ -639,73 +658,73 @@
 ; SSE2-LABEL: 'extract_i8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'extract_i8'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'extract_i8'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE4-LABEL: 'extract_i8'
@@ -742,43 +761,67 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX512-LABEL: 'extract_i8'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512F-LABEL: 'extract_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'extract_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'extract_i8'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg
diff --git a/llvm/test/Analysis/CostModel/X86/vector-insert.ll b/llvm/test/Analysis/CostModel/X86/vector-insert.ll
--- a/llvm/test/Analysis/CostModel/X86/vector-insert.ll
+++ b/llvm/test/Analysis/CostModel/X86/vector-insert.ll
@@ -9,8 +9,8 @@
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
 ;
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,GLM
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,BTVER2
 
 define i32 @insert_double(i32 %arg) {
@@ -34,12 +34,12 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = insertelement <2 x double> undef, double undef, i32 1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = insertelement <4 x double> undef, double undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = insertelement <4 x double> undef, double undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = insertelement <8 x double> undef, double undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = insertelement <8 x double> undef, double undef, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_4 = insertelement <8 x double> undef, double undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'insert_double'
@@ -48,12 +48,12 @@
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = insertelement <2 x double> undef, double undef, i32 1
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_a = insertelement <4 x double> undef, double undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = insertelement <4 x double> undef, double undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_3 = insertelement <4 x double> undef, double undef, i32 3
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_a = insertelement <8 x double> undef, double undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = insertelement <8 x double> undef, double undef, i32 0
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_4 = insertelement <8 x double> undef, double undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_3 = insertelement <8 x double> undef, double undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_4 = insertelement <8 x double> undef, double undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_7 = insertelement <8 x double> undef, double undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2f64_a = insertelement <2 x double> undef, double undef, i32 %arg
@@ -74,24 +74,81 @@
 }
 
 define i32 @insert_float(i32 %arg) {
-; SSE-LABEL: 'insert_float'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'insert_float'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_float'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_float'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_float'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'insert_float'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
@@ -103,13 +160,13 @@
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'insert_float'
@@ -122,15 +179,53 @@
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'insert_float'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_float'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_a = insertelement <4 x float> undef, float undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> undef, float undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> undef, float undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_a = insertelement <8 x float> undef, float undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> undef, float undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> undef, float undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> undef, float undef, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> undef, float undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_a = insertelement <16 x float> undef, float undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> undef, float undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> undef, float undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> undef, float undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> undef, float undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
   %v2f32_a = insertelement <2 x float> undef, float undef, i32 %arg
   %v2f32_0 = insertelement <2 x float> undef, float undef, i32 0
   %v2f32_1 = insertelement <2 x float> undef, float undef, i32 1
@@ -155,19 +250,117 @@
 }
 
 define i32 @insert_i64(i32 %arg) {
-; CHECK-LABEL: 'insert_i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'insert_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_i64'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_i64'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'insert_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> undef, i64 undef, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = insertelement <4 x i64> undef, i64 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> undef, i64 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> undef, i64 undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = insertelement <8 x i64> undef, i64 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> undef, i64 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> undef, i64 undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> undef, i64 undef, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> undef, i64 undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2i64_a = insertelement <2 x i64> undef, i64 undef, i32 %arg
   %v2i64_0 = insertelement <2 x i64> undef, i64 undef, i32 0
@@ -187,24 +380,157 @@
 }
 
 define i32 @insert_i32(i32 %arg) {
-; CHECK-LABEL: 'insert_i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'insert_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_i32'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_i32'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'insert_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> undef, i32 undef, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = insertelement <4 x i32> undef, i32 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> undef, i32 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> undef, i32 undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = insertelement <8 x i32> undef, i32 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> undef, i32 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> undef, i32 undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> undef, i32 undef, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> undef, i32 undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = insertelement <16 x i32> undef, i32 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> undef, i32 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> undef, i32 undef, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> undef, i32 undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> undef, i32 undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v2i32_a = insertelement <2 x i32> undef, i32 undef, i32 %arg
   %v2i32_0 = insertelement <2 x i32> undef, i32 undef, i32 0
@@ -230,24 +556,81 @@
 }
 
 define i32 @insert_i16(i32 %arg) {
-; CHECK-LABEL: 'insert_i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-LABEL: 'insert_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'insert_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'insert_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg
   %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0
@@ -272,29 +655,221 @@
 }
 
 define i32 @insert_i8(i32 %arg) {
-; CHECK-LABEL: 'insert_i8'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'insert_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_i8'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_i8'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'insert_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'insert_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %v16i8_a  = insertelement <16 x i8> undef, i8 undef, i32 %arg
   %v16i8_0  = insertelement <16 x i8> undef, i8 undef, i32 0
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -455,11 +455,17 @@
 ;
 
 define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v2i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <2 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v2i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <2 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v2i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <2 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
 ; AVX-LABEL: 'splatvar_shift_v2i64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
@@ -486,11 +492,17 @@
 }
 
 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v4i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <4 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <4 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <4 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v4i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
@@ -535,11 +547,17 @@
 }
 
 define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %shift = ashr <8 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %shift = ashr <8 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %shift = ashr <8 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
@@ -584,11 +602,35 @@
 }
 
 define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, i32 %b) {
-; CHECK-LABEL: 'splatvar_shift_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX-LABEL: 'splatvar_shift_v4i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; XOP-LABEL: 'splatvar_shift_v4i32'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX512-LABEL: 'splatvar_shift_v4i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <4 x i32> %a, %splat
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
 ;
   %insert = insertelement <4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -597,11 +639,17 @@
 }
 
 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <8 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <8 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <8 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
@@ -646,11 +694,17 @@
 }
 
 define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v16i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v16i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v16i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v16i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
@@ -861,7 +915,7 @@
 
 define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v16i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %shift = ashr <16 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
@@ -898,7 +952,7 @@
 
 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v32i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %shift = ashr <32 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
@@ -953,7 +1007,7 @@
 
 define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v64i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 216 for instruction: %shift = ashr <64 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
--- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
@@ -467,11 +467,35 @@
 ;
 
 define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, i64 %b) {
-; CHECK-LABEL: 'splatvar_shift_v2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v2i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v2i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; AVX-LABEL: 'splatvar_shift_v2i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; XOP-LABEL: 'splatvar_shift_v2i64'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; AVX512-LABEL: 'splatvar_shift_v2i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <2 x i64> %a, %splat
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
   %insert = insertelement <2 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -480,11 +504,17 @@
 }
 
 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v4i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <4 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <4 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <4 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v4i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
@@ -529,11 +559,17 @@
 }
 
 define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <8 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <8 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <8 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
@@ -578,11 +614,35 @@
 }
 
 define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, i32 %b) {
-; CHECK-LABEL: 'splatvar_shift_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX-LABEL: 'splatvar_shift_v4i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; XOP-LABEL: 'splatvar_shift_v4i32'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX512-LABEL: 'splatvar_shift_v4i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <4 x i32> %a, %splat
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
 ;
   %insert = insertelement <4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -591,11 +651,17 @@
 }
 
 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <8 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <8 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <8 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
@@ -640,11 +706,17 @@
 }
 
 define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v16i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v16i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v16i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v16i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
@@ -855,7 +927,7 @@
 
 define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v16i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %shift = lshr <16 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
@@ -892,7 +964,7 @@
 
 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v32i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %shift = lshr <32 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
@@ -947,7 +1019,7 @@
 
 define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v64i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %shift = lshr <64 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll
--- a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll
@@ -459,11 +459,35 @@
 ;
 
 define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, i64 %b) {
-; CHECK-LABEL: 'splatvar_shift_v2i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v2i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v2i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; AVX-LABEL: 'splatvar_shift_v2i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; XOP-LABEL: 'splatvar_shift_v2i64'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
+;
+; AVX512-LABEL: 'splatvar_shift_v2i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <2 x i64> undef, i64 %b, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <2 x i64> %a, %splat
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %shift
 ;
   %insert = insertelement <2 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -472,11 +496,17 @@
 }
 
 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v4i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <4 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <4 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <4 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v4i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
@@ -521,11 +551,17 @@
 }
 
 define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <8 x i64> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <8 x i64> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <8 x i64> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
@@ -570,11 +606,35 @@
 }
 
 define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, i32 %b) {
-; CHECK-LABEL: 'splatvar_shift_v4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX-LABEL: 'splatvar_shift_v4i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; XOP-LABEL: 'splatvar_shift_v4i32'
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
+; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
+;
+; AVX512-LABEL: 'splatvar_shift_v4i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <4 x i32> undef, i32 %b, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <4 x i32> %a, %splat
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %shift
 ;
   %insert = insertelement <4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -583,11 +643,17 @@
 }
 
 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v8i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <8 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v8i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <8 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v8i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <8 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v8i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
@@ -632,11 +698,17 @@
 }
 
 define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
-; SSE-LABEL: 'splatvar_shift_v16i32'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i32> %a, %splat
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+; SSE2-LABEL: 'splatvar_shift_v16i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i32> %a, %splat
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
+;
+; SSE42-LABEL: 'splatvar_shift_v16i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i32> %a, %splat
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %shift
 ;
 ; AVX1-LABEL: 'splatvar_shift_v16i32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
@@ -847,7 +919,7 @@
 
 define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v16i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %shift = shl <16 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
@@ -884,7 +956,7 @@
 
 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v32i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %shift = shl <32 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
@@ -939,7 +1011,7 @@
 
 define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; SSE2-LABEL: 'splatvar_shift_v64i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %shift = shl <64 x i8> %a, %splat
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -1,37 +1,59 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=sandybridge < %s | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SLOW
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=SLOW
 
 define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) {
 ; NORMAL-LABEL: @foo(
 ; NORMAL-NEXT:  entry:
-; NORMAL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NORMAL:       vector.ph:
-; NORMAL-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NORMAL:       vector.body:
-; NORMAL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NORMAL-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; NORMAL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; NORMAL-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
-; NORMAL-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
-; NORMAL-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; NORMAL-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; NORMAL-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
-; NORMAL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; NORMAL-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; NORMAL-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
-; NORMAL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; NORMAL-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; NORMAL-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
-; NORMAL:       middle.block:
-; NORMAL-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; NORMAL:       scalar.ph:
 ; NORMAL-NEXT:    br label [[FOR_BODY:%.*]]
 ; NORMAL:       for.cond.cleanup:
 ; NORMAL-NEXT:    ret void
 ; NORMAL:       for.body:
-; NORMAL-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !2
+; NORMAL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NORMAL-NEXT:    [[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; NORMAL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; NORMAL-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; NORMAL-NEXT:    [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; NORMAL-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
+; NORMAL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; NORMAL-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; NORMAL-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; NORMAL-NEXT:    store i32 [[ADD4]], i32* [[ARRAYIDX6]], align 4
+; NORMAL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NORMAL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
+; NORMAL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+;
+; AVX-LABEL: @foo(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX:       vector.ph:
+; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX:       vector.body:
+; AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; AVX-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; AVX-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
+; AVX-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
+; AVX-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; AVX-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
+; AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; AVX-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; AVX:       middle.block:
+; AVX-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; AVX:       scalar.ph:
+; AVX-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX:       for.cond.cleanup:
+; AVX-NEXT:    ret void
+; AVX:       for.body:
+; AVX-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !2
 ;
 ; SLOW-LABEL: @foo(
 ; SLOW-NEXT:  entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -18,77 +18,59 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa !1
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 1
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP21]], i32 2
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP22]], i32 3
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP23]], i32 4
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP24]], i32 5
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP25]], i32 6
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP26]], i32 7
-; CHECK-NEXT:    [[TMP35:%.*]] = mul nsw <8 x i32> [[TMP34]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP36:%.*]] = add <8 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP37]] = add <8 x i32> [[TMP36]], [[TMP35]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
-; CHECK-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> undef, i32 [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP13]], i32 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP14]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <4 x i32> [[TMP18]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP20:%.*]] = add <4 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP21]] = add <4 x i32> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP37]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP37]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 96
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP21]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP21]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, 100
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[ADD7_LCSSA]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa !1
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa !1
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP40]]
+; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa !1
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP25]], [[TMP24]]
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SUM_015]], 4
 ; CHECK-NEXT:    [[ADD7]] = add i32 [[ADD]], [[MUL]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
@@ -5,13 +5,15 @@
 ; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4
-; CHECK-NEXT:    [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32
-; CHECK-NEXT:    [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5
-; CHECK-NEXT:    [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32
-; CHECK-NEXT:    [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x half> undef, half [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP7]], i32 5
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
 
 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: @add_sub_v8i32(
@@ -101,44 +101,11 @@
 }
 
 define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE-LABEL: @ashr_shl_v8i32(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
-; SSE-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
-; SSE-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; SSE-NEXT:    [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
-; SSE-NEXT:    [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
-; SSE-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <8 x i32> [[R7]]
-;
-; SLM-LABEL: @ashr_shl_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SLM-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; SLM-NEXT:    ret <8 x i32> [[R7]]
-;
-; AVX-LABEL: @ashr_shl_v8i32(
-; AVX-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:    ret <8 x i32> [[R7]]
-;
-; AVX512-LABEL: @ashr_shl_v8i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[R7]]
+; CHECK-LABEL: @ashr_shl_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[R7]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -184,14 +151,6 @@
 ; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
-; SLM-LABEL: @ashr_shl_v8i32_const(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
-; SLM-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x i32> [[R7]]
-;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
 ; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
@@ -243,66 +202,31 @@
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
 ; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
 ; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
 ; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
 ; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
-; SSE-NEXT:    [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4
-; SSE-NEXT:    [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5
 ; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
 ; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
 ; SSE-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
 ; SSE-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; SSE-NEXT:    [[AB2:%.*]] = lshr i32 [[A2]], [[B2]]
-; SSE-NEXT:    [[AB3:%.*]] = lshr i32 [[A3]], [[B3]]
-; SSE-NEXT:    [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
-; SSE-NEXT:    [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
+; SSE-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
 ; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
 ; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
 ; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
-; SLM-LABEL: @ashr_lshr_shl_v8i32(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
-; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
-; SLM-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
-; SLM-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; SLM-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; SLM-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
-; SLM-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SLM-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
-; SLM-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[R7]]
-;
 ; AVX1-LABEL: @ashr_lshr_shl_v8i32(
 ; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -1,15 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer < %s -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 
 ; Verify that the SLP vectorizer is able to figure out that commutativity
 ; offers the possibility to splat/broadcast %c and thus make it profitable
 ; to vectorize this case
 
-
-; ModuleID = 'bugpoint-reduced-simplified.bc'
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.11.0"
-
 @cle = external unnamed_addr global [32 x i8], align 16
 @cle32 = external unnamed_addr global [32 x i32], align 16
 
@@ -18,29 +15,64 @@
 ; commutativity property of `xor`.
 
 define void @splat(i8 %a, i8 %b, i8 %c) {
-; CHECK-LABEL: @splat(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A:%.*]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]]
-; CHECK-NEXT:    store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @splat(
+; SSE-NEXT:    [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
+; SSE-NEXT:    store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
+; SSE-NEXT:    [[TMP2:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
+; SSE-NEXT:    [[TMP3:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
+; SSE-NEXT:    [[TMP4:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
+; SSE-NEXT:    [[TMP5:%.*]] = xor i8 [[C]], [[A]]
+; SSE-NEXT:    store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
+; SSE-NEXT:    [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
+; SSE-NEXT:    store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
+; SSE-NEXT:    [[TMP7:%.*]] = xor i8 [[C]], [[A]]
+; SSE-NEXT:    store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
+; SSE-NEXT:    [[TMP8:%.*]] = xor i8 [[C]], [[B]]
+; SSE-NEXT:    store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
+; SSE-NEXT:    [[TMP9:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
+; SSE-NEXT:    [[TMP10:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
+; SSE-NEXT:    [[TMP11:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
+; SSE-NEXT:    [[TMP12:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
+; SSE-NEXT:    [[TMP13:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
+; SSE-NEXT:    [[TMP14:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
+; SSE-NEXT:    [[TMP15:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
+; SSE-NEXT:    [[TMP16:%.*]] = xor i8 [[A]], [[C]]
+; SSE-NEXT:    store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @splat(
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A:%.*]], i32 0
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1
+; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; AVX-NEXT:    [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]]
+; AVX-NEXT:    store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
+; AVX-NEXT:    ret void
 ;
   %1 = xor i8 %c, %a
   store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
@@ -83,22 +115,37 @@
 ; leveraging the commutativity property of `xor`.
 
 define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
-; CHECK-LABEL: @same_opcode_on_one_side(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]]
-; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @same_opcode_on_one_side(
+; SSE-NEXT:    [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]]
+; SSE-NEXT:    [[ADD2:%.*]] = add i32 [[C]], [[A]]
+; SSE-NEXT:    [[ADD3:%.*]] = add i32 [[A]], [[C]]
+; SSE-NEXT:    [[ADD4:%.*]] = add i32 [[C]], [[A]]
+; SSE-NEXT:    [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]]
+; SSE-NEXT:    store i32 [[TMP1]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
+; SSE-NEXT:    [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]]
+; SSE-NEXT:    store i32 [[TMP2]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
+; SSE-NEXT:    [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]]
+; SSE-NEXT:    store i32 [[TMP3]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
+; SSE-NEXT:    [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]]
+; SSE-NEXT:    store i32 [[TMP4]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @same_opcode_on_one_side(
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
+; AVX-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3
+; AVX-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]]
+; AVX-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
+; AVX-NEXT:    ret void
 ;
   %add1 = add i32 %c, %a
   %add2 = add i32 %c, %a
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -221,14 +221,10 @@
 
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x float> [[R07]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
@@ -288,33 +284,11 @@
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; SSE-LABEL: @test_v4i64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x i64> [[R03]]
-;
-; SLM-LABEL: @test_v4i64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; SLM-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    ret <4 x i64> [[TMP3]]
-;
-; AVX-LABEL: @test_v4i64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <4 x i64> [[TMP3]]
-;
-; AVX512-LABEL: @test_v4i64(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; AVX512-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    ret <4 x i64> [[TMP3]]
+; CHECK-LABEL: @test_v4i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %a0 = extractelement <4 x i64> %a, i32 0
   %a1 = extractelement <4 x i64> %a, i32 1
@@ -336,33 +310,11 @@
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE-LABEL: @test_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SSE-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x i32> [[R07]]
-;
-; SLM-LABEL: @test_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX-LABEL: @test_v8i32(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; AVX-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @test_v8i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; AVX512-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @test_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -221,14 +221,10 @@
 
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SSE-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x float> [[R07]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
@@ -288,33 +284,11 @@
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; SSE-LABEL: @test_v4i64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT:    [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x i64> [[R03]]
-;
-; SLM-LABEL: @test_v4i64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; SLM-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    ret <4 x i64> [[TMP3]]
-;
-; AVX-LABEL: @test_v4i64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; AVX-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <4 x i64> [[TMP3]]
-;
-; AVX512-LABEL: @test_v4i64(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; AVX512-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    ret <4 x i64> [[TMP3]]
+; CHECK-LABEL: @test_v4i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %a0 = extractelement <4 x i64> %a, i32 0
   %a1 = extractelement <4 x i64> %a, i32 1
@@ -336,33 +310,11 @@
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE-LABEL: @test_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SSE-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x i32> [[R07]]
-;
-; SLM-LABEL: @test_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX-LABEL: @test_v8i32(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; AVX-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @test_v8i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; AVX512-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @test_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -214,77 +214,44 @@
 
 ; Unused insertelement
 define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
-; BIGTHRESH-LABEL: @simple_select_no_users(
-; BIGTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
-; BIGTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
-; BIGTHRESH-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
-; BIGTHRESH-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
-; BIGTHRESH-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; BIGTHRESH-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; BIGTHRESH-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; BIGTHRESH-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; BIGTHRESH-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
-; BIGTHRESH-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
-; BIGTHRESH-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; BIGTHRESH-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
-; BIGTHRESH-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
-; BIGTHRESH-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
-; BIGTHRESH-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-; BIGTHRESH-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
-; BIGTHRESH-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
-; BIGTHRESH-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
-; BIGTHRESH-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
-; BIGTHRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
-; BIGTHRESH-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
-; BIGTHRESH-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
-; BIGTHRESH-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
-; BIGTHRESH-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
-; BIGTHRESH-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
-; BIGTHRESH-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
-; BIGTHRESH-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
-; BIGTHRESH-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
-; BIGTHRESH-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; BIGTHRESH-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0
-; BIGTHRESH-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
-; BIGTHRESH-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
-; BIGTHRESH-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; BIGTHRESH-NEXT:    [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2
-; BIGTHRESH-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; BIGTHRESH-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
-; BIGTHRESH-NEXT:    ret <4 x float> [[RD]]
-;
-; ZEROTHRESH-LABEL: @simple_select_no_users(
-; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
-; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
-; ZEROTHRESH-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
-; ZEROTHRESH-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
-; ZEROTHRESH-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; ZEROTHRESH-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; ZEROTHRESH-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; ZEROTHRESH-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; ZEROTHRESH-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
-; ZEROTHRESH-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
-; ZEROTHRESH-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; ZEROTHRESH-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
-; ZEROTHRESH-NEXT:    [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
-; ZEROTHRESH-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
-; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
-; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C3]], i32 1
-; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-; ZEROTHRESH-NEXT:    [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
-; ZEROTHRESH-NEXT:    [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
-; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
-; ZEROTHRESH-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1
-; ZEROTHRESH-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
-; ZEROTHRESH-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B3]], i32 1
-; ZEROTHRESH-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
-; ZEROTHRESH-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0
-; ZEROTHRESH-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
-; ZEROTHRESH-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; ZEROTHRESH-NEXT:    [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 2
-; ZEROTHRESH-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; ZEROTHRESH-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP10]], i32 3
-; ZEROTHRESH-NEXT:    ret <4 x float> [[RD]]
+; ANY-LABEL: @simple_select_no_users(
+; ANY-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ANY-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ANY-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ANY-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ANY-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ANY-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ANY-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; ANY-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; ANY-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ANY-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ANY-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; ANY-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
+; ANY-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
+; ANY-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
+; ANY-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+; ANY-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
+; ANY-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
+; ANY-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
+; ANY-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
+; ANY-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
+; ANY-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
+; ANY-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
+; ANY-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
+; ANY-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
+; ANY-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
+; ANY-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
+; ANY-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
+; ANY-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
+; ANY-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
+; ANY-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0
+; ANY-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; ANY-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
+; ANY-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
+; ANY-NEXT:    [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2
+; ANY-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
+; ANY-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
+; ANY-NEXT:    ret <4 x float> [[RD]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -slp-threshold=-6 -slp-vectorizer -instcombine < %s | FileCheck %s
+; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx  -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -11,21 +13,34 @@
 ; zero-extend the roots back to their original sizes.
 ;
 define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
-; CHECK-LABEL: @PR31243_zext(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMPE4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMPE4]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    ret i8 [[TMP8]]
+; SSE-LABEL: @PR31243_zext(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; SSE-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]]
+; SSE-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; SSE-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; SSE-NEXT:    ret i8 [[TMP8]]
+;
+; AVX-LABEL: @PR31243_zext(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; AVX-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
+; AVX-NEXT:    [[TMPE4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]]
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]]
+; AVX-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMPE4]], align 1
+; AVX-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; AVX-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; AVX-NEXT:    ret i8 [[TMP8]]
 ;
 entry:
   %tmp0 = zext i8 %v0 to i32
@@ -54,22 +69,35 @@
 ;        ensure correctness.
 ;
 define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
-; CHECK-LABEL: @PR31243_sext(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    ret i8 [[TMP8]]
+; SSE-LABEL: @PR31243_sext(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; SSE-NEXT:    [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]]
+; SSE-NEXT:    [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; SSE-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; SSE-NEXT:    ret i8 [[TMP8]]
+;
+; AVX-LABEL: @PR31243_sext(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]]
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]]
+; AVX-NEXT:    [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
+; AVX-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; AVX-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; AVX-NEXT:    ret i8 [[TMP8]]
 ;
 entry:
   %tmp0 = sext i8 %v0 to i32
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+avx  -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -slp-vectorizer -slp-vectorizer -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 
 %class.1 = type { %class.2 }
 %class.2 = type { %"class.3" }
@@ -10,23 +12,40 @@
 
 ; Function Attrs: uwtable
 define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 {
-; CHECK-LABEL: @_ZN1C10SwitchModeEv(
-; CHECK-NEXT:  for.body.lr.ph.i:
-; CHECK-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
-; CHECK-NEXT:    store i64 [[OR_1]], i64* undef, align 8
-; CHECK-NEXT:    [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
-; CHECK-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
-; CHECK-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @_ZN1C10SwitchModeEv(
+; SSE-NEXT:  for.body.lr.ph.i:
+; SSE-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
+; SSE-NEXT:    store i64 [[OR_1]], i64* undef, align 8
+; SSE-NEXT:    [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+; SSE-NEXT:    [[FOO_3:%.*]] = load i64, i64* [[FOO_1]], align 8
+; SSE-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
+; SSE-NEXT:    [[FOO_4:%.*]] = load i64, i64* [[FOO_2]], align 8
+; SSE-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
+; SSE-NEXT:    [[AND_2:%.*]] = and i64 [[OR_1]], [[FOO_3]]
+; SSE-NEXT:    [[AND_1:%.*]] = and i64 [[BAR5]], [[FOO_4]]
+; SSE-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+; SSE-NEXT:    store i64 [[AND_2]], i64* [[BAR3]], align 8
+; SSE-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
+; SSE-NEXT:    store i64 [[AND_1]], i64* [[BAR4]], align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @_ZN1C10SwitchModeEv(
+; AVX-NEXT:  for.body.lr.ph.i:
+; AVX-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
+; AVX-NEXT:    store i64 [[OR_1]], i64* undef, align 8
+; AVX-NEXT:    [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+; AVX-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
+; AVX-NEXT:    [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
+; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; AVX-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
+; AVX-NEXT:    [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
+; AVX-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+; AVX-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
+; AVX-NEXT:    [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
+; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; AVX-NEXT:    ret void
 ;
 for.body.lr.ph.i:
   %or.1 = or i64 undef, 1
@@ -47,34 +66,65 @@
 
 ; Function Attrs: norecurse nounwind uwtable
 define void @pr35497() local_unnamed_addr #0 {
-; CHECK-LABEL: @pr35497(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 undef, undef
-; CHECK-NEXT:    store i64 [[ADD]], i64* undef, align 1
-; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
-; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
-; CHECK-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> undef, i64 [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = lshr <2 x i64> [[TMP13]], <i64 6, i64 6>
-; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 1
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @pr35497(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
+; SSE-NEXT:    [[AND:%.*]] = shl i64 [[TMP0]], 2
+; SSE-NEXT:    [[SHL:%.*]] = and i64 [[AND]], 20
+; SSE-NEXT:    [[ADD:%.*]] = add i64 undef, undef
+; SSE-NEXT:    store i64 [[ADD]], i64* undef, align 1
+; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
+; SSE-NEXT:    [[AND_1:%.*]] = shl i64 undef, 2
+; SSE-NEXT:    [[SHL_1:%.*]] = and i64 [[AND_1]], 20
+; SSE-NEXT:    [[SHR_1:%.*]] = lshr i64 undef, 6
+; SSE-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]]
+; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
+; SSE-NEXT:    [[SHR_2:%.*]] = lshr i64 undef, 6
+; SSE-NEXT:    [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]]
+; SSE-NEXT:    [[AND_4:%.*]] = shl i64 [[ADD]], 2
+; SSE-NEXT:    [[SHL_4:%.*]] = and i64 [[AND_4]], 20
+; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
+; SSE-NEXT:    store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1
+; SSE-NEXT:    [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2
+; SSE-NEXT:    [[SHL_5:%.*]] = and i64 [[AND_5]], 20
+; SSE-NEXT:    [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6
+; SSE-NEXT:    [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]]
+; SSE-NEXT:    store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1
+; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+; SSE-NEXT:    store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1
+; SSE-NEXT:    [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6
+; SSE-NEXT:    [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]]
+; SSE-NEXT:    store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @pr35497(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
+; AVX-NEXT:    [[ADD:%.*]] = add i64 undef, undef
+; AVX-NEXT:    store i64 [[ADD]], i64* undef, align 1
+; AVX-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0]], i32 1
+; AVX-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
+; AVX-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
+; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
+; AVX-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
+; AVX-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; AVX-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; AVX-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; AVX-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+; AVX-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
+; AVX-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> undef, i64 [[TMP11]], i32 0
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP14:%.*]] = lshr <2 x i64> [[TMP13]], <i64 6, i64 6>
+; AVX-NEXT:    [[TMP15:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP14]]
+; AVX-NEXT:    [[TMP16:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; AVX-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 1
+; AVX-NEXT:    ret void
 ;
 entry:
   %0 = load i64, i64* undef, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -110,7 +110,7 @@
   ; YAML-NEXT: Function:        foo
   ; YAML-NEXT: Args:
   ; YAML-NEXT:   - String:          'Stores SLP vectorized with cost '
-  ; YAML-NEXT:   - Cost:            '-8'
+  ; YAML-NEXT:   - Cost:            '-5'
   ; YAML-NEXT:   - String:          ' and with tree size '
   ; YAML-NEXT:   - TreeSize:        '4'
 
@@ -120,7 +120,7 @@
   ; YAML-NEXT: Function:        foo
   ; YAML-NEXT: Args:
   ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-  ; YAML-NEXT:   - Cost:            '-4'
+  ; YAML-NEXT:   - Cost:            '-7'
   ; YAML-NEXT:   - String:          ' and with tree size '
   ; YAML-NEXT:   - TreeSize:        '1'
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -12,70 +12,84 @@
 ; CHECK-NEXT:    [[SUB_I:%.*]] = add nsw i32 undef, -1
 ; CHECK-NEXT:    [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SUB_I]] to i8
+; CHECK-NEXT:    [[CONV_I_I1199:%.*]] = and i8 [[TMP1]], 1
+; CHECK-NEXT:    store i8 [[CONV_I_I1199]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8
+; CHECK-NEXT:    [[CONV_1_I_I:%.*]] = and i8 [[TMP2]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
+; CHECK-NEXT:    store i8 [[CONV_1_I_I]], i8* [[ARRAYIDX_I_I7_1_I_I]], align 1
+; CHECK-NEXT:    [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[SHR_1_I_I]] to i8
+; CHECK-NEXT:    [[CONV_2_I_I:%.*]] = and i8 [[TMP3]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
+; CHECK-NEXT:    store i8 [[CONV_2_I_I]], i8* [[ARRAYIDX_I_I7_2_I_I]], align 1
+; CHECK-NEXT:    [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[SHR_2_I_I]] to i8
+; CHECK-NEXT:    [[CONV_3_I_I:%.*]] = and i8 [[TMP4]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3
+; CHECK-NEXT:    store i8 [[CONV_3_I_I]], i8* [[ARRAYIDX_I_I7_3_I_I]], align 1
+; CHECK-NEXT:    [[SHR_3_I_I:%.*]] = lshr i32 [[CONV31_I]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[SHR_3_I_I]] to i8
+; CHECK-NEXT:    [[CONV_4_I_I:%.*]] = and i8 [[TMP5]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4
+; CHECK-NEXT:    store i8 [[CONV_4_I_I]], i8* [[ARRAYIDX_I_I7_4_I_I]], align 1
+; CHECK-NEXT:    [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 [[SHR_4_I_I]] to i8
+; CHECK-NEXT:    [[CONV_5_I_I:%.*]] = and i8 [[TMP6]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
+; CHECK-NEXT:    store i8 [[CONV_5_I_I]], i8* [[ARRAYIDX_I_I7_5_I_I]], align 1
+; CHECK-NEXT:    [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[SHR_5_I_I]] to i8
+; CHECK-NEXT:    [[CONV_6_I_I:%.*]] = and i8 [[TMP7]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
+; CHECK-NEXT:    store i8 [[CONV_6_I_I]], i8* [[ARRAYIDX_I_I7_6_I_I]], align 1
+; CHECK-NEXT:    [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i32 [[SHR_6_I_I]] to i8
+; CHECK-NEXT:    [[CONV_7_I_I:%.*]] = and i8 [[TMP8]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    store i8 [[CONV_7_I_I]], i8* [[ARRAYIDX_I_I7_7_I_I]], align 1
+; CHECK-NEXT:    [[SHR_7_I_I:%.*]] = lshr i32 [[CONV31_I]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[SHR_7_I_I]] to i8
+; CHECK-NEXT:    [[CONV_8_I_I:%.*]] = and i8 [[TMP9]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
+; CHECK-NEXT:    store i8 [[CONV_8_I_I]], i8* [[ARRAYIDX_I_I7_8_I_I]], align 1
+; CHECK-NEXT:    [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc i32 [[SHR_8_I_I]] to i8
+; CHECK-NEXT:    [[CONV_9_I_I:%.*]] = and i8 [[TMP10]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
+; CHECK-NEXT:    store i8 [[CONV_9_I_I]], i8* [[ARRAYIDX_I_I7_9_I_I]], align 1
+; CHECK-NEXT:    [[SHR_9_I_I:%.*]] = lshr i32 [[CONV31_I]], 10
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i32 [[SHR_9_I_I]] to i8
+; CHECK-NEXT:    [[CONV_10_I_I:%.*]] = and i8 [[TMP11]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
+; CHECK-NEXT:    store i8 [[CONV_10_I_I]], i8* [[ARRAYIDX_I_I7_10_I_I]], align 1
+; CHECK-NEXT:    [[SHR_10_I_I:%.*]] = lshr i32 [[CONV31_I]], 11
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i32 [[SHR_10_I_I]] to i8
+; CHECK-NEXT:    [[CONV_11_I_I:%.*]] = and i8 [[TMP12]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], <i32 9, i32 10, i32 11, i32 12>
+; CHECK-NEXT:    store i8 [[CONV_11_I_I]], i8* [[ARRAYIDX_I_I7_11_I_I]], align 1
+; CHECK-NEXT:    [[SHR_11_I_I:%.*]] = lshr i32 [[CONV31_I]], 12
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[SHR_11_I_I]] to i8
+; CHECK-NEXT:    [[CONV_12_I_I:%.*]] = and i8 [[TMP13]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
+; CHECK-NEXT:    store i8 [[CONV_12_I_I]], i8* [[ARRAYIDX_I_I7_12_I_I]], align 1
 ; CHECK-NEXT:    [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc i32 [[SHR_12_I_I]] to i8
+; CHECK-NEXT:    [[CONV_13_I_I:%.*]] = and i8 [[TMP14]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
+; CHECK-NEXT:    store i8 [[CONV_13_I_I]], i8* [[ARRAYIDX_I_I7_13_I_I]], align 1
 ; CHECK-NEXT:    [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc i32 [[SHR_13_I_I]] to i8
+; CHECK-NEXT:    [[CONV_14_I_I:%.*]] = and i8 [[TMP15]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
+; CHECK-NEXT:    store i8 [[CONV_14_I_I]], i8* [[ARRAYIDX_I_I7_14_I_I]], align 1
 ; CHECK-NEXT:    [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
-; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
-; CHECK-NEXT:    [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
-; CHECK-NEXT:    [[TMP44:%.*]] = and <16 x i8> [[TMP43]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc i32 [[SHR_14_I_I]] to i8
+; CHECK-NEXT:    [[CONV_15_I_I:%.*]] = and i8 [[TMP16]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
-; CHECK-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-NEXT:    store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
+; CHECK-NEXT:    store i8 [[CONV_15_I_I]], i8* [[ARRAYIDX_I_I7_15_I_I]], align 1
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end50.i:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
@@ -43,23 +43,40 @@
 }
 
 define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
-; SSE-LABEL: @loadext_4i8_to_4i32(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
-; SSE-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
-; SSE-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
-; SSE-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i32> [[V3]]
+; SSE2-LABEL: @loadext_4i8_to_4i32(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i32(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
+; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
+; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
+; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i32> [[V3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -115,58 +132,22 @@
 ; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
 ; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX1-LABEL: @loadext_4i8_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX2-LABEL: @loadext_4i8_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX512-LABEL: @loadext_4i8_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i8_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -805,58 +786,22 @@
 ; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
 ; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX1-LABEL: @loadext_4i16_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX2-LABEL: @loadext_4i16_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX512-LABEL: @loadext_4i16_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i16_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -1058,58 +1003,22 @@
 ; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
 ; SSE-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX1-LABEL: @loadext_4i32_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = sext i32 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = sext i32 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX2-LABEL: @loadext_4i32_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX512-LABEL: @loadext_4i32_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i32_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %p2 = getelementptr inbounds i32, i32* %p0, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
@@ -11,15 +11,26 @@
 ;
 
 define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
-; SSE-LABEL: @loadext_2i8_to_2i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-LABEL: @loadext_2i8_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i8_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i8_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -114,76 +125,57 @@
 }
 
 define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
-; SSE-LABEL: @loadext_4i8_to_4i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX1-LABEL: @loadext_4i8_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-LABEL: @loadext_4i8_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX2-LABEL: @loadext_4i8_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
+; SLM-LABEL: @loadext_4i8_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
+; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
+; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX512-LABEL: @loadext_4i8_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i8_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -701,15 +693,26 @@
 ;
 
 define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
-; SSE-LABEL: @loadext_2i16_to_2i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-LABEL: @loadext_2i16_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i16_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i16_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -804,76 +807,57 @@
 }
 
 define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
-; SSE-LABEL: @loadext_4i16_to_4i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX1-LABEL: @loadext_4i16_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-LABEL: @loadext_4i16_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX2-LABEL: @loadext_4i16_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
+; SLM-LABEL: @loadext_4i16_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
+; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
+; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX512-LABEL: @loadext_4i16_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i16_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -1025,15 +1009,26 @@
 ;
 
 define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
-; SSE-LABEL: @loadext_2i32_to_2i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-LABEL: @loadext_2i32_to_2i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i32_to_2i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    ret <2 x i64> [[V1]]
 ;
 ; AVX-LABEL: @loadext_2i32_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
@@ -1057,76 +1052,57 @@
 }
 
 define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
-; SSE-LABEL: @loadext_4i32_to_4i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
-;
-; AVX1-LABEL: @loadext_4i32_to_4i64(
-; AVX1-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX1-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX1-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX1-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
-; AVX1-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; AVX1-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; AVX1-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
-; AVX1-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
-; AVX1-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX1-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX1-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX1-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX1-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX1-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-LABEL: @loadext_4i32_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX2-LABEL: @loadext_4i32_to_4i64(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; AVX2-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX2-NEXT:    ret <4 x i64> [[V3]]
+; SLM-LABEL: @loadext_4i32_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SLM-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SLM-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SLM-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SLM-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SLM-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
+; SLM-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
+; SLM-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
+; SLM-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
+; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SLM-NEXT:    ret <4 x i64> [[V3]]
 ;
-; AVX512-LABEL: @loadext_4i32_to_4i64(
-; AVX512-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; AVX512-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX512-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX512-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX512-NEXT:    ret <4 x i64> [[V3]]
+; AVX-LABEL: @loadext_4i32_to_4i64(
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX-NEXT:    ret <4 x i64> [[V3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %p2 = getelementptr inbounds i32, i32* %p0, i64 2