diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2695,6 +2695,7 @@ static const CostTblEntry X64CostTbl[] = { // 64-bit targets { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, 14 }, + { ISD::BSWAP, MVT::i64, 1 }, { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH { ISD::CTPOP, MVT::i64, 10 }, @@ -2708,6 +2709,8 @@ { ISD::BITREVERSE, MVT::i32, 14 }, { ISD::BITREVERSE, MVT::i16, 14 }, { ISD::BITREVERSE, MVT::i8, 11 }, + { ISD::BSWAP, MVT::i32, 1 }, + { ISD::BSWAP, MVT::i16, 1 }, // MOVZX + ROL by 8 { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV @@ -2919,6 +2922,17 @@ return adjustTableCost(*Entry, LT.first, ICA.getFlags()); } + if (ST->hasMOVBE()) { + if (const Instruction *II = ICA.getInst()) { + if (II->hasOneUse() && isa<StoreInst>(II->user_back())) + return TTI::TCC_Free; + if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { + if (LI->hasOneUse()) + return TTI::TCC_Free; + } + } + } + // TODO - add BMI (TZCNT) scalar handling if (ST->is64Bit()) diff --git a/llvm/test/Analysis/CostModel/X86/bswap-store.ll b/llvm/test/Analysis/CostModel/X86/bswap-store.ll --- a/llvm/test/Analysis/CostModel/X86/bswap-store.ll +++ b/llvm/test/Analysis/CostModel/X86/bswap-store.ll @@ -11,12 +11,12 @@ define void @var_bswap_store_i16(i16 %a, i16* %dst) { ; NOMOVBE-LABEL: 'var_bswap_store_i16' -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1 ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; MOVBE-LABEL: 'var_bswap_store_i16' -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) ; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1 ; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -26,17 +26,11 @@ ret void } define void @var_bswap_store_i16_extrause(i16 %a, i16* %dst) { -; NOMOVBE-LABEL: 'var_bswap_store_i16_extrause' -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; MOVBE-LABEL: 'var_bswap_store_i16_extrause' -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; 
ALL-LABEL: 'var_bswap_store_i16_extrause' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1 +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2 +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %bswap = call i16 @llvm.bswap.i16(i16 %a) store i16 %bswap, i16* %dst, align 1 @@ -47,10 +41,15 @@ } define void @var_bswap_store_i32(i32 %a, i32* %dst) { -; ALL-LABEL: 'var_bswap_store_i32' -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1 -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; NOMOVBE-LABEL: 'var_bswap_store_i32' +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1 +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; MOVBE-LABEL: 'var_bswap_store_i32' +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) +; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1 +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %bswap = call i32 @llvm.bswap.i32(i32 %a) store i32 %bswap, i32* %dst, align 1 @@ -73,16 +72,6 @@ } define void @var_bswap_store_i64(i64 %a, i64* %dst) { -; X64-LABEL: 'var_bswap_store_i64' -; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) -; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %bswap, i64* %dst, align 1 -; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; X86-LABEL: 'var_bswap_store_i64' -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) -; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; %bswap = call i64 @llvm.bswap.i64(i64 %a) store i64 %bswap, i64* %dst, align 1 @@ -96,7 +85,7 @@ ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; X86-LABEL: 'var_bswap_store_i64_extrause' -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i64 %bswap, 2 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -110,16 +99,6 @@ } define void @var_bswap_store_i128(i128 %a, i128* %dst) { -; X64-LABEL: 'var_bswap_store_i128' -; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) -; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1 -; 
X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; X86-LABEL: 'var_bswap_store_i128' -; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; %bswap = call i128 @llvm.bswap.i128(i128 %a) store i128 %bswap, i128* %dst, align 1 @@ -127,13 +106,13 @@ } define void @var_bswap_store_i128_extrause(i128 %a, i128* %dst) { ; X64-LABEL: 'var_bswap_store_i128_extrause' -; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i128 %bswap, 2 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; X86-LABEL: 'var_bswap_store_i128_extrause' -; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap2 = shl i128 %bswap, 2 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void diff --git a/llvm/test/Analysis/CostModel/X86/bswap.ll b/llvm/test/Analysis/CostModel/X86/bswap.ll --- a/llvm/test/Analysis/CostModel/X86/bswap.ll +++ b/llvm/test/Analysis/CostModel/X86/bswap.ll @@ -12,13 +12,9 @@ ; Verify the cost of scalar bswap instructions. 
define i16 @var_bswap_i16(i16 %a) { -; NOMOVBE-LABEL: 'var_bswap_i16' -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap -; -; MOVBE-LABEL: 'var_bswap_i16' -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap +; ALL-LABEL: 'var_bswap_i16' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap ; %bswap = call i16 @llvm.bswap.i16(i16 %a) ret i16 %bswap @@ -39,7 +35,7 @@ ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap ; ; X86-LABEL: 'var_bswap_i64' -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap ; %bswap = call i64 @llvm.bswap.i64(i64 %a) @@ -48,11 +44,11 @@ define i128 @var_bswap_i128(i128 %a) { ; X64-LABEL: 'var_bswap_i128' -; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap ; ; X86-LABEL: 'var_bswap_i128' -; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap ; %bswap = call i128 @llvm.bswap.i128(i128 %a) diff --git a/llvm/test/Analysis/CostModel/X86/load-bswap.ll b/llvm/test/Analysis/CostModel/X86/load-bswap.ll --- a/llvm/test/Analysis/CostModel/X86/load-bswap.ll +++ b/llvm/test/Analysis/CostModel/X86/load-bswap.ll @@ -12,12 +12,12 @@ define i16 @var_load_bswap_i16(i16* %src) { ; NOMOVBE-LABEL: 'var_load_bswap_i16' ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap ; ; MOVBE-LABEL: 'var_load_bswap_i16' ; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) ; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap ; %a = load i16, i16* %src, align 1 @@ -26,19 +26,12 @@ ret i16 %bswap } define i16 @var_load_bswap_i16_extrause(i16* %src, i16* %clobberdst) { -; NOMOVBE-LABEL: 'var_load_bswap_i16_extrause' -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %a = load i16, i16* %src, align 1 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1 -; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap -; -; MOVBE-LABEL: 'var_load_bswap_i16_extrause' -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1 -; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap +; ALL-LABEL: 'var_load_bswap_i16_extrause' +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1 +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a) +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2 +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1 +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap ; %a = load i16, i16* %src, align 1 %bswap = call i16 @llvm.bswap.i16(i16 %a) @@ -50,10 +43,15 @@ } define i32 @var_load_bswap_i32(i32* %src) { -; ALL-LABEL: 'var_load_bswap_i32' -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1 -; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) -; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap +; NOMOVBE-LABEL: 'var_load_bswap_i32' +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1 +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) +; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap +; +; MOVBE-LABEL: 'var_load_bswap_i32' +; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1 +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a) +; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap ; %a = load i32, i32* %src, align 1 %bswap = call i32 @llvm.bswap.i32(i32 %a) @@ -78,16 +76,6 @@ } define i64 @var_load_bswap_i64(i64* %src) { -; X64-LABEL: 'var_load_bswap_i64' -; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i64, i64* %src, align 1 -; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) -; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap -; -; X86-LABEL: 'var_load_bswap_i64' -; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) -; X86-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret i64 %bswap -; %a = load i64, i64* %src, align 1 %bswap = call i64 @llvm.bswap.i64(i64 %a) @@ -103,7 +91,7 @@ ; ; X86-LABEL: 'var_load_bswap_i64_extrause' ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i64 %a, 2 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %a2, i64* %clobberdst, align 1 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap @@ -118,16 +106,6 @@ } define i128 @var_load_bswap_i128(i128* %src) { -; X64-LABEL: 'var_load_bswap_i128' -; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1 -; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) -; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap -; -; X86-LABEL: 'var_load_bswap_i128' -; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) -; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap -; %a = load i128, i128* %src, align 1 %bswap = call i128 @llvm.bswap.i128(i128 %a) @@ -136,14 +114,14 @@ define i128 @var_load_bswap_i128_extrause(i128* %src, i128* %clobberdst) { ; X64-LABEL: 'var_load_bswap_i128_extrause' ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1 -; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i128 %a, 2 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %a2, i128* %clobberdst, align 1 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap ; ; X86-LABEL: 'var_load_bswap_i128_extrause' ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1 -; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) +; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a) ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a2 = shl i128 %a, 2 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %a2, i128* %clobberdst, align 1 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll @@ -42,18 +42,30 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @abs_v8i64( -; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, 
<2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false) -; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false) -; SLM-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false) -; SLM-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false) -; SLM-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.abs.i64(i64 [[A0]], i1 false) +; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.abs.i64(i64 [[A1]], i1 false) +; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.abs.i64(i64 [[A2]], i1 false) +; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.abs.i64(i64 [[A3]], i1 false) +; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.abs.i64(i64 [[A4]], i1 false) +; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.abs.i64(i64 [[A5]], i1 false) +; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.abs.i64(i64 [[A6]], i1 false) +; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.abs.i64(i64 [[A7]], i1 false) +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v8i64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll @@ -62,40 +62,31 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 
x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; +; AVX512-LABEL: @add_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; ; AVX1-LABEL: @add_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 @@ -114,7 +105,6 @@ ; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void -; ; AVX2-LABEL: @add_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -125,14 +115,6 @@ ; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @add_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 -; AVX512-NEXT: ret void -; ; AVX256BW-LABEL: @add_v8i64( ; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -143,7 +125,6 @@ ; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void -; %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll @@ -26,41 +26,6 @@ declare i8 @llvm.uadd.sat.i8 (i8 , i8 ) define void @add_v8i64() { -; SSE-LABEL: @add_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr 
inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll @@ -62,40 +62,31 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x 
i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), 
align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; +; AVX512-LABEL: @sub_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; ; AVX1-LABEL: @sub_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 @@ -114,7 +105,6 @@ ; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void -; ; AVX2-LABEL: @sub_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -125,14 +115,6 @@ ; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @sub_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 -; AVX512-NEXT: ret void -; ; AVX256BW-LABEL: @sub_v8i64( 
; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -143,7 +125,6 @@ ; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void -; %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll @@ -26,41 +26,6 @@ declare i8 @llvm.usub.sat.i8 (i8 , i8 ) define void @sub_v8i64() { -; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 
[[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll @@ -22,11 +22,17 @@ declare i8 @llvm.bitreverse.i8(i8) define void @bitreverse_2i64() #0 { -; CHECK-LABEL: @bitreverse_2i64( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) -; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @bitreverse_2i64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: ret void +; +; XOP-LABEL: @bitreverse_2i64( +; XOP-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; XOP-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; XOP-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8