diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -514,6 +514,10 @@
           "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
           "Prefer a left/right vector logical shift pair over a shift+and pair">;
 
+def FeatureFastMOVBE
+    : SubtargetFeature<"fast-movbe", "HasFastMOVBE", "true",
+        "Prefer a movbe over a single-use load + bswap / single-use bswap + store">;
+
 def FeatureUseGLMDivSqrtCosts
     : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
                        "Use Goldmont specific floating point div/sqrt costs">;
@@ -820,6 +824,7 @@
                                 FeatureSlowDivide64,
                                 FeatureSlowPMULLD,
                                 FeatureFast7ByteNOP,
+                                FeatureFastMOVBE,
                                 FeaturePOPCNTFalseDeps,
                                 FeatureInsertVZEROUPPER];
   list<SubtargetFeature> SLMFeatures =
@@ -839,6 +844,7 @@
                                 FeatureSlowTwoMemOps,
                                 FeatureSlowLEA,
                                 FeatureSlowIncDec,
+                                FeatureFastMOVBE,
                                 FeaturePOPCNTFalseDeps,
                                 FeatureInsertVZEROUPPER];
   list<SubtargetFeature> GLMFeatures =
@@ -851,6 +857,7 @@
                                 FeatureSlowTwoMemOps,
                                 FeatureSlowLEA,
                                 FeatureSlowIncDec,
+                                FeatureFastMOVBE,
                                 FeatureInsertVZEROUPPER];
   list<SubtargetFeature> GLPFeatures =
     !listconcat(GLMFeatures, GLPAdditionalFeatures);
@@ -924,6 +931,7 @@
                                 FeatureSlowTwoMemOps,
                                 FeaturePreferMaskRegisters,
                                 FeatureHasFastGather,
+                                FeatureFastMOVBE,
                                 FeatureSlowPMADDWD];
   // TODO Add AVX5124FMAPS/AVX5124VNNIW features
   list<SubtargetFeature> KNMFeatures =
@@ -983,6 +991,7 @@
                                 FeatureFast15ByteNOP,
                                 FeatureFastScalarShiftMasks,
                                 FeatureFastVectorShiftMasks,
+                                FeatureFastMOVBE,
                                 FeatureSlowSHLD];
   list<SubtargetFeature> BtVer2Features =
     !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1017,7 +1026,9 @@
                                 FeatureTBM,
                                 FeatureFMA,
                                 FeatureFastBEXTR];
-  list<SubtargetFeature> BdVer2Tuning = BdVer1Tuning;
+  list<SubtargetFeature> BdVer2AdditionalTuning = [FeatureFastMOVBE];
+  list<SubtargetFeature> BdVer2Tuning =
+    !listconcat(BdVer1Tuning, BdVer2AdditionalTuning);
   list<SubtargetFeature> BdVer2Features =
     !listconcat(BdVer1Features, BdVer2AdditionalFeatures);
@@ -1077,6 +1088,7 @@
                                 FeatureFast15ByteNOP,
                                 FeatureBranchFusion,
                                 FeatureFastScalarShiftMasks,
+                                FeatureFastMOVBE,
                                 FeatureSlowSHLD,
                                 FeatureInsertVZEROUPPER];
   list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
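Note: besides being implied by the CPU tuning lists above, the new subtarget feature can be requested explicitly with -mattr=+movbe,+fast-movbe (as the updated RUN lines below do) or, per function, through the "target-features" string attribute. A minimal, hypothetical IR example for illustration only (not part of this patch):

define void @uses_fast_movbe() "target-features"="+movbe,+fast-movbe" {
  ret void
}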
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -433,6 +433,9 @@
   /// Prefer a left/right vector logical shifts pair over a shift+and pair.
   bool HasFastVectorShiftMasks = false;
 
+  /// Prefer a movbe over a single-use load + bswap / single-use bswap + store.
+  bool HasFastMOVBE = false;
+
   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool UseRetpolineIndirectCalls = false;
@@ -714,6 +717,7 @@
   bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
   bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; }
   bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
+  bool hasFastMOVBE() const { return HasFastMOVBE; }
   bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasBranchFusion() const { return HasBranchFusion; }
   bool hasERMSB() const { return HasERMSB; }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2695,6 +2695,7 @@
   static const CostTblEntry X64CostTbl[] = { // 64-bit targets
     { ISD::ABS,        MVT::i64,  2 }, // SUB+CMOV
     { ISD::BITREVERSE, MVT::i64, 14 },
+    { ISD::BSWAP,      MVT::i64,  1 },
     { ISD::CTLZ,       MVT::i64,  4 }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTTZ,       MVT::i64,  3 }, // TEST+BSF+CMOV/BRANCH
     { ISD::CTPOP,      MVT::i64, 10 },
@@ -2708,6 +2709,8 @@
     { ISD::BITREVERSE, MVT::i32, 14 },
     { ISD::BITREVERSE, MVT::i16, 14 },
     { ISD::BITREVERSE, MVT::i8,  11 },
+    { ISD::BSWAP,      MVT::i32,  1 },
+    { ISD::BSWAP,      MVT::i16,  1 }, // ROL
     { ISD::CTLZ,       MVT::i32,  4 }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTLZ,       MVT::i16,  4 }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTLZ,       MVT::i8,   4 }, // BSR+XOR or BSR+XOR+CMOV
@@ -2919,6 +2922,17 @@
     return adjustTableCost(*Entry, LT.first, ICA.getFlags());
   }
 
+  if (ST->hasMOVBE() && ST->hasFastMOVBE()) {
+    if (const Instruction *II = ICA.getInst()) {
+      if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
+        return TTI::TCC_Free;
+      if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
+        if (LI->hasOneUse())
+          return TTI::TCC_Free;
+      }
+    }
+  }
+
   // TODO - add BMI (TZCNT) scalar handling
 
   if (ST->is64Bit())
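For reference, the new getIntrinsicInstrCost() clause only fires for the two shapes MOVBE can implement directly: a bswap whose single use is a store, and a bswap fed by a single-use load. A minimal IR sketch of those shapes (illustrative only; it mirrors the tests below, and on a +movbe,+fast-movbe target each pair is expected to lower to one MOVBE):

declare i32 @llvm.bswap.i32(i32)

define i32 @load_then_bswap(i32* %src) {
  %v = load i32, i32* %src, align 1      ; single-use load feeding the bswap
  %b = call i32 @llvm.bswap.i32(i32 %v)  ; costed as TTI::TCC_Free with fast-movbe
  ret i32 %b
}

define void @bswap_then_store(i32 %a, i32* %dst) {
  %b = call i32 @llvm.bswap.i32(i32 %a)  ; only use is the store below
  store i32 %b, i32* %dst, align 1
  ret void
}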
diff --git a/llvm/test/Analysis/CostModel/X86/bswap-store.ll b/llvm/test/Analysis/CostModel/X86/bswap-store.ll
--- a/llvm/test/Analysis/CostModel/X86/bswap-store.ll
+++ b/llvm/test/Analysis/CostModel/X86/bswap-store.ll
@@ -1,8 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X64
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X86
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X64,SLOWMOVBE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe,+fast-movbe | FileCheck %s --check-prefixes=ALL,X64,FASTMOVBE
+
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X86,SLOWMOVBE
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe,+fast-movbe | FileCheck %s --check-prefixes=ALL,X86,FASTMOVBE
 
 declare i16 @llvm.bswap.i16(i16)
 declare i32 @llvm.bswap.i32(i32)
@@ -11,14 +14,19 @@
 define void @var_bswap_store_i16(i16 %a, i16* %dst) {
 ; NOMOVBE-LABEL: 'var_bswap_store_i16'
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
 ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
 ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; MOVBE-LABEL: 'var_bswap_store_i16'
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SLOWMOVBE-LABEL: 'var_bswap_store_i16'
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; FASTMOVBE-LABEL: 'var_bswap_store_i16'
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %bswap = call i16 @llvm.bswap.i16(i16 %a)
   store i16 %bswap, i16* %dst, align 1
@@ -26,17 +34,11 @@
   ret void
 }
 
 define void @var_bswap_store_i16_extrause(i16 %a, i16* %dst) {
-; NOMOVBE-LABEL: 'var_bswap_store_i16_extrause'
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; MOVBE-LABEL: 'var_bswap_store_i16_extrause'
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ALL-LABEL: 'var_bswap_store_i16_extrause'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %bswap, i16* %dst, align 1
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap2 = shl i16 %bswap, 2
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %bswap = call i16 @llvm.bswap.i16(i16 %a)
   store i16 %bswap, i16* %dst, align 1
@@ -47,10 +49,20 @@
 }
 
 define void @var_bswap_store_i32(i32 %a, i32* %dst) {
-; ALL-LABEL: 'var_bswap_store_i32'
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; NOMOVBE-LABEL: 'var_bswap_store_i32'
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SLOWMOVBE-LABEL: 'var_bswap_store_i32'
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; FASTMOVBE-LABEL: 'var_bswap_store_i32'
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %bswap, i32* %dst, align 1
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %bswap = call i32 @llvm.bswap.i32(i32 %a)
   store i32 %bswap, i32* %dst, align 1
@@ -73,16 +85,6 @@
 }
 
 define void @var_bswap_store_i64(i64 %a, i64* %dst) {
-; X64-LABEL: 'var_bswap_store_i64'
-; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
-; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %bswap, i64* %dst, align 1
-; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; X86-LABEL: 'var_bswap_store_i64'
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
-; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
   %bswap = call i64 @llvm.bswap.i64(i64 %a)
   store i64 %bswap, i64* %dst, align 1
@@ -96,7 +98,7 @@
 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; X86-LABEL: 'var_bswap_store_i64_extrause'
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %bswap, i64* %dst, align 1
 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i64 %bswap, 2
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -110,16 +112,6 @@
 }
 
 define void @var_bswap_store_i128(i128 %a, i128* %dst) {
-; X64-LABEL: 'var_bswap_store_i128'
-; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
-; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1
-; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; X86-LABEL: 'var_bswap_store_i128'
-; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
   %bswap = call i128 @llvm.bswap.i128(i128 %a)
   store i128 %bswap, i128* %dst, align 1
@@ -127,13 +119,13 @@
 }
 
 define void @var_bswap_store_i128_extrause(i128 %a, i128* %dst) {
 ; X64-LABEL: 'var_bswap_store_i128_extrause'
-; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %bswap, i128* %dst, align 1
 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap2 = shl i128 %bswap, 2
 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; X86-LABEL: 'var_bswap_store_i128_extrause'
-; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %bswap, i128* %dst, align 1
 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap2 = shl i128 %bswap, 2
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/X86/bswap.ll b/llvm/test/Analysis/CostModel/X86/bswap.ll
--- a/llvm/test/Analysis/CostModel/X86/bswap.ll
+++ b/llvm/test/Analysis/CostModel/X86/bswap.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X64
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X86
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,X64
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X64
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,X86
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X86
 
 declare i16 @llvm.bswap.i16(i16)
 declare i32 @llvm.bswap.i32(i32)
@@ -12,13 +12,9 @@
 ; Verify the cost of scalar bswap instructions.
 
 define i16 @var_bswap_i16(i16 %a) {
-; NOMOVBE-LABEL: 'var_bswap_i16'
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
-;
-; MOVBE-LABEL: 'var_bswap_i16'
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
+; ALL-LABEL: 'var_bswap_i16'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
 ;
   %bswap = call i16 @llvm.bswap.i16(i16 %a)
   ret i16 %bswap
@@ -39,7 +35,7 @@
 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
 ;
 ; X86-LABEL: 'var_bswap_i64'
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
 ;
   %bswap = call i64 @llvm.bswap.i64(i64 %a)
@@ -48,11 +44,11 @@
 
 define i128 @var_bswap_i128(i128 %a) {
 ; X64-LABEL: 'var_bswap_i128'
-; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
 ;
 ; X86-LABEL: 'var_bswap_i128'
-; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
 ;
   %bswap = call i128 @llvm.bswap.i128(i128 %a)
diff --git a/llvm/test/Analysis/CostModel/X86/load-bswap.ll b/llvm/test/Analysis/CostModel/X86/load-bswap.ll
--- a/llvm/test/Analysis/CostModel/X86/load-bswap.ll
+++ b/llvm/test/Analysis/CostModel/X86/load-bswap.ll
@@ -1,8 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X64
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
-; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,MOVBE,X86
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X64
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X64,SLOWMOVBE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+movbe,+fast-movbe | FileCheck %s --check-prefixes=ALL,X64,FASTMOVBE
+
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze | FileCheck %s --check-prefixes=ALL,NOMOVBE,X86
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe | FileCheck %s --check-prefixes=ALL,X86,SLOWMOVBE
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -cost-model -analyze -mattr=+movbe,+fast-movbe | FileCheck %s --check-prefixes=ALL,X86,FASTMOVBE
 
 declare i16 @llvm.bswap.i16(i16)
 declare i32 @llvm.bswap.i32(i32)
@@ -12,13 +15,18 @@
 define i16 @var_load_bswap_i16(i16* %src) {
 ; NOMOVBE-LABEL: 'var_load_bswap_i16'
 ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
 ; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
 ;
-; MOVBE-LABEL: 'var_load_bswap_i16'
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
+; SLOWMOVBE-LABEL: 'var_load_bswap_i16'
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
+;
+; FASTMOVBE-LABEL: 'var_load_bswap_i16'
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
 ;
   %a = load i16, i16* %src, align 1
   %bswap = call i16 @llvm.bswap.i16(i16 %a)
@@ -26,19 +34,12 @@
   ret i16 %bswap
 }
 
 define i16 @var_load_bswap_i16_extrause(i16* %src, i16* %clobberdst) {
-; NOMOVBE-LABEL: 'var_load_bswap_i16_extrause'
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1
-; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
-;
-; MOVBE-LABEL: 'var_load_bswap_i16_extrause'
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1
-; MOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
+; ALL-LABEL: 'var_load_bswap_i16_extrause'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, i16* %src, align 1
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i16 @llvm.bswap.i16(i16 %a)
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a2 = shl i16 %a, 2
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %a2, i16* %clobberdst, align 1
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bswap
 ;
   %a = load i16, i16* %src, align 1
   %bswap = call i16 @llvm.bswap.i16(i16 %a)
@@ -50,10 +51,20 @@
 }
 
 define i32 @var_load_bswap_i32(i32* %src) {
-; ALL-LABEL: 'var_load_bswap_i32'
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
+; NOMOVBE-LABEL: 'var_load_bswap_i32'
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; NOMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
+;
+; SLOWMOVBE-LABEL: 'var_load_bswap_i32'
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; SLOWMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
+;
+; FASTMOVBE-LABEL: 'var_load_bswap_i32'
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i32, i32* %src, align 1
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bswap = call i32 @llvm.bswap.i32(i32 %a)
+; FASTMOVBE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bswap
 ;
   %a = load i32, i32* %src, align 1
   %bswap = call i32 @llvm.bswap.i32(i32 %a)
@@ -78,16 +89,6 @@
 }
 
 define i64 @var_load_bswap_i64(i64* %src) {
-; X64-LABEL: 'var_load_bswap_i64'
-; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i64, i64* %src, align 1
-; X64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
-; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
-;
-; X86-LABEL: 'var_load_bswap_i64'
-; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
-; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
-;
   %a = load i64, i64* %src, align 1
   %bswap = call i64 @llvm.bswap.i64(i64 %a)
@@ -103,7 +104,7 @@
 ;
 ; X86-LABEL: 'var_load_bswap_i64_extrause'
 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i64, i64* %src, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i64 @llvm.bswap.i64(i64 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i64 %a, 2
 ; X86-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i64 %a2, i64* %clobberdst, align 1
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bswap
@@ -118,16 +119,6 @@
 }
 
 define i128 @var_load_bswap_i128(i128* %src) {
-; X64-LABEL: 'var_load_bswap_i128'
-; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1
-; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
-; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
-;
-; X86-LABEL: 'var_load_bswap_i128'
-; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
-; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
-;
   %a = load i128, i128* %src, align 1
   %bswap = call i128 @llvm.bswap.i128(i128 %a)
@@ -136,14 +127,14 @@
 
 define i128 @var_load_bswap_i128_extrause(i128* %src, i128* %clobberdst) {
 ; X64-LABEL: 'var_load_bswap_i128_extrause'
 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a = load i128, i128* %src, align 1
-; X64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = shl i128 %a, 2
 ; X64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store i128 %a2, i128* %clobberdst, align 1
 ; X64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
 ;
 ; X86-LABEL: 'var_load_bswap_i128_extrause'
 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a = load i128, i128* %src, align 1
-; X86-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
+; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bswap = call i128 @llvm.bswap.i128(i128 %a)
 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a2 = shl i128 %a, 2
 ; X86-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 %a2, i128* %clobberdst, align 1
 ; X86-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i128 %bswap
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll
@@ -42,18 +42,30 @@
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @abs_v8i64(
-; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false)
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false)
-; SLM-NEXT:    [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false)
-; SLM-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false)
-; SLM-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x
i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.abs.i64(i64 [[A0]], i1 false) +; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.abs.i64(i64 [[A1]], i1 false) +; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.abs.i64(i64 [[A2]], i1 false) +; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.abs.i64(i64 [[A3]], i1 false) +; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.abs.i64(i64 [[A4]], i1 false) +; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.abs.i64(i64 [[A5]], i1 false) +; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.abs.i64(i64 [[A6]], i1 false) +; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.abs.i64(i64 [[A7]], i1 false) +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v8i64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll @@ -62,40 +62,31 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; 
SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x 
i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; +; AVX512-LABEL: @add_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; ; AVX1-LABEL: @add_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 @@ -114,7 +105,6 @@ ; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void -; ; AVX2-LABEL: @add_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -125,14 +115,6 @@ ; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @add_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> 
[[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 -; AVX512-NEXT: ret void -; ; AVX256BW-LABEL: @add_v8i64( ; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -143,7 +125,6 @@ ; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void -; %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll @@ -26,41 +26,6 @@ declare i8 @llvm.uadd.sat.i8 (i8 , i8 ) define void @add_v8i64() { -; SSE-LABEL: @add_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 
@llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll @@ -62,40 +62,31 @@ ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( -; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SLM-NEXT: [[B6:%.*]] = load i64, i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) +; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) +; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: 
store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; +; AVX512-LABEL: @sub_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; ; AVX1-LABEL: @sub_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 @@ -114,7 +105,6 @@ ; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void -; ; AVX2-LABEL: @sub_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -125,14 +115,6 @@ ; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void -; -; AVX512-LABEL: @sub_v8i64( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]]) -; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 -; AVX512-NEXT: ret void -; ; AVX256BW-LABEL: @sub_v8i64( ; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -143,7 +125,6 @@ ; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void -; %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll @@ -26,41 +26,6 @@ declare i8 @llvm.usub.sat.i8 (i8 , i8 ) define void @sub_v8i64() { -; SSE-LABEL: @sub_v8i64( -; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]]) -; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]]) -; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]]) -; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]]) -; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]]) -; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]]) -; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]]) -; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 
x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; SSE-NEXT: ret void -; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll @@ -22,11 +22,17 @@ declare i8 @llvm.bitreverse.i8(i8) define void @bitreverse_2i64() #0 { -; CHECK-LABEL: @bitreverse_2i64( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) -; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 -; CHECK-NEXT: ret void +; SSE-LABEL: @bitreverse_2i64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE-NEXT: ret void +; +; XOP-LABEL: @bitreverse_2i64( +; XOP-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; XOP-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; XOP-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8