Index: llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1349,6 +1349,23 @@
     Value *IIOperand = II->getArgOperand(0);
     Value *X = nullptr;
 
+    // Try to canonicalize bswap-of-logical-shift-by-8-bit-multiple as
+    // inverse-shift-of-bswap:
+    // bswap (shl X, C) --> lshr (bswap X), C
+    // bswap (lshr X, C) --> shl (bswap X), C
+    BinaryOperator *BO;
+    if (match(IIOperand, m_OneUse(m_BinOp(BO)))) {
+      const APInt *C;
+      if (match(BO, m_LogicalShift(m_Value(X), m_APIntAllowUndef(C))) &&
+          (*C & 7) == 0) {
+        Value *NewSwap = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, X);
+        BinaryOperator::BinaryOps InverseShift =
+            BO->getOpcode() == Instruction::Shl ? Instruction::LShr
+                                                : Instruction::Shl;
+        return BinaryOperator::Create(InverseShift, NewSwap, BO->getOperand(1));
+      }
+    }
+
     KnownBits Known = computeKnownBits(IIOperand, 0, II);
     uint64_t LZ = alignDown(Known.countMinLeadingZeros(), 8);
     uint64_t TZ = alignDown(Known.countMinTrailingZeros(), 8);
Index: llvm/test/Transforms/InstCombine/bswap-fold.ll
===================================================================
--- llvm/test/Transforms/InstCombine/bswap-fold.ll
+++ llvm/test/Transforms/InstCombine/bswap-fold.ll
@@ -26,8 +26,8 @@
 
 define i32 @lshr8_i32(i32 %x) {
 ; CHECK-LABEL: @lshr8_i32(
-; CHECK-NEXT:    [[S:%.*]] = lshr i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.bswap.i32(i32 [[S]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = shl i32 [[TMP1]], 8
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
   %s = lshr i32 %x, 8
@@ -37,8 +37,8 @@
 
 define <2 x i32> @lshr16_v2i32(<2 x i32> %x) {
 ; CHECK-LABEL: @lshr16_v2i32(
-; CHECK-NEXT:    [[S:%.*]] = lshr <2 x i32> [[X:%.*]], <i32 16, i32 16>
-; CHECK-NEXT:    [[R:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[S]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[X:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = shl <2 x i32> [[TMP1]], <i32 16, i32 16>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %s = lshr <2 x i32> %x, <i32 16, i32 16>
@@ -48,14 +48,16 @@
 
 define i32 @lshr24_i32(i32 %x) {
 ; CHECK-LABEL: @lshr24_i32(
-; CHECK-NEXT:    [[S:%.*]] = and i32 [[X:%.*]], -16777216
-; CHECK-NEXT:    ret i32 [[S]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[X:%.*]], -16777216
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %s = lshr i32 %x, 24
   %r = call i32 @llvm.bswap.i32(i32 %s)
   ret i32 %r
 }
 
+; negative test - shift amount must be a whole-byte multiple (divisible by 8)
+
 define i32 @lshr12_i32(i32 %x) {
 ; CHECK-LABEL: @lshr12_i32(
 ; CHECK-NEXT:    [[S:%.*]] = lshr i32 [[X:%.*]], 12
@@ -67,6 +69,8 @@
   ret i32 %r
 }
 
+; negative test - extra use of the shift defeats the one-use fold
+
 define i32 @lshr8_i32_use(i32 %x, i32* %p) {
 ; CHECK-LABEL: @lshr8_i32_use(
 ; CHECK-NEXT:    [[S:%.*]] = lshr i32 [[X:%.*]], 12
@@ -82,8 +86,8 @@
 
 define i64 @shl16_i64(i64 %x) {
 ; CHECK-LABEL: @shl16_i64(
-; CHECK-NEXT:    [[S:%.*]] = shl i64 [[X:%.*]], 16
-; CHECK-NEXT:    [[R:%.*]] = call i64 @llvm.bswap.i64(i64 [[S]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[X:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = lshr i64 [[TMP1]], 16
 ; CHECK-NEXT:    ret i64 [[R]]
 ;
   %s = shl i64 %x, 16
@@ -91,10 +95,12 @@
   ret i64 %r
 }
 
+; poison vector element propagates
+
 define <2 x i64> @shl16_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: @shl16_v2i64(
-; CHECK-NEXT:    [[S:%.*]] = shl <2 x i64> [[X:%.*]], <i64 poison, i64 24>
-; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[S]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[X:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = lshr <2 x i64> [[TMP1]], <i64 poison, i64 24>
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %s = shl <2 x i64> %x, <i64 poison, i64 24>
@@ -104,14 +110,16 @@
 
 define i64 @shl56_i64(i64 %x) {
 ; CHECK-LABEL: @shl56_i64(
-; CHECK-NEXT:    [[S:%.*]] = and i64 [[X:%.*]], 255
-; CHECK-NEXT:    ret i64 [[S]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], 255
+; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
   %s = shl i64 %x, 56
   %r = call i64 @llvm.bswap.i64(i64 %s)
   ret i64 %r
 }
 
+; negative test - shift amount must be a whole-byte multiple (divisible by 8)
+
 define i64 @shl42_i64(i64 %x) {
 ; CHECK-LABEL: @shl42_i64(
 ; CHECK-NEXT:    [[S:%.*]] = shl i64 [[X:%.*]], 42
@@ -123,6 +131,8 @@
   ret i64 %r
 }
 
+; negative test - extra use of the shift defeats the one-use fold
+
 define i32 @shl8_i32_use(i32 %x, i32* %p) {
 ; CHECK-LABEL: @shl8_i32_use(
 ; CHECK-NEXT:    [[S:%.*]] = shl i32 [[X:%.*]], 8
@@ -136,11 +146,11 @@
   ret i32 %r
 }
 
+; swaps cancel
+
 define i64 @swap_shl16_i64(i64 %x) {
 ; CHECK-LABEL: @swap_shl16_i64(
-; CHECK-NEXT:    [[B:%.*]] = call i64 @llvm.bswap.i64(i64 [[X:%.*]])
-; CHECK-NEXT:    [[S:%.*]] = shl i64 [[B]], 16
-; CHECK-NEXT:    [[R:%.*]] = call i64 @llvm.bswap.i64(i64 [[S]])
+; CHECK-NEXT:    [[R:%.*]] = lshr i64 [[X:%.*]], 16
 ; CHECK-NEXT:    ret i64 [[R]]
 ;
   %b = call i64 @llvm.bswap.i64(i64 %x)
@@ -536,11 +546,11 @@
   ret <2 x i64> %3
 }
 
-; negative test
+; TODO: This should fold to 'and'.
 define <2 x i64> @bs_active_high_undef(<2 x i64> %0) {
 ; CHECK-LABEL: @bs_active_high_undef(
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP0:%.*]], <i64 56, i64 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP0:%.*]])
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], <i64 56, i64 undef>
 ; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %2 = shl <2 x i64> %0, <i64 56, i64 undef>