diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9588,6 +9588,26 @@
     return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap);
   }
 
+  // fold (bswap shl(x,c)) -> (zext(bswap(trunc(shl(x,sub(c,bw/2))))))
+  // iff x >= bw/2 (i.e. lower half is known zero)
+  unsigned BW = VT.getScalarSizeInBits();
+  if (BW >= 32 && N0.getOpcode() == ISD::SHL && N0.hasOneUse()) {
+    auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), BW / 2);
+    if (ShAmt && ShAmt->getAPIntValue().ult(BW) &&
+        ShAmt->getZExtValue() >= (BW / 2) &&
+        (ShAmt->getZExtValue() % 16) == 0 && TLI.isTruncateFree(VT, HalfVT) &&
+        (!LegalOperations || hasOperation(ISD::BSWAP, HalfVT))) {
+      SDValue Res = N0.getOperand(0);
+      if (uint64_t NewShAmt = (ShAmt->getZExtValue() - (BW / 2)))
+        Res = DAG.getNode(ISD::SHL, DL, VT, Res,
+                          DAG.getConstant(NewShAmt, DL, getShiftAmountTy(VT)));
+      Res = DAG.getZExtOrTrunc(Res, DL, HalfVT);
+      Res = DAG.getNode(ISD::BSWAP, DL, HalfVT, Res);
+      return DAG.getZExtOrTrunc(Res, DL, VT);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
--- a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
@@ -442,8 +442,8 @@
 ; CHECK-LABEL: zext_load_i32_by_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    rev w0, w8
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    lsr w0, w8, #16
 ; CHECK-NEXT:    ret
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll
--- a/llvm/test/CodeGen/AArch64/load-combine.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine.ll
@@ -499,8 +499,8 @@
 ; CHECK-LABEL: zext_load_i32_by_i8_bswap:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    rev w0, w8
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    lsr w0, w8, #16
 ; CHECK-NEXT:    ret
   %tmp = bitcast i32* %arg to i8*
diff --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll
--- a/llvm/test/CodeGen/X86/combine-bswap.ll
+++ b/llvm/test/CodeGen/X86/combine-bswap.ll
@@ -87,17 +87,16 @@
 define i64 @test_bswap64_shift48_zext(i16 %a0) {
 ; X86-LABEL: test_bswap64_shift48_zext:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bswap64_shift48_zext:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shlq $48, %rax
-; X64-NEXT:    bswapq %rax
+; X64-NEXT:    rolw $8, %di
+; X64-NEXT:    movzwl %di, %eax
 ; X64-NEXT:    retq
   %z = zext i16 %a0 to i64
   %s = shl i64 %z, 48
@@ -109,16 +108,15 @@
 ; X86-LABEL: test_bswap64_shift48:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    bswapl %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bswap64_shift48:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlq $48, %rax
-; X64-NEXT:    bswapq %rax
+; X64-NEXT:    rolw $8, %di
+; X64-NEXT:    movzwl %di, %eax
 ; X64-NEXT:    retq
   %s = shl i64 %a0, 48
   %b = call i64 @llvm.bswap.i64(i64 %s)
diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll
--- a/llvm/test/CodeGen/X86/load-combine.ll
+++ b/llvm/test/CodeGen/X86/load-combine.ll
@@ -1209,20 +1209,33 @@
 ; i8* p;
 ; (i32) p[1] | ((i32) p[0] << 8)
 define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
-; CHECK-LABEL: zext_load_i32_by_i8_bswap:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movzwl (%eax), %eax
-; CHECK-NEXT:    shll $16, %eax
-; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    retl
+; BSWAP-LABEL: zext_load_i32_by_i8_bswap:
+; BSWAP:       # %bb.0:
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BSWAP-NEXT:    movzwl (%eax), %eax
+; BSWAP-NEXT:    rolw $8, %ax
+; BSWAP-NEXT:    movzwl %ax, %eax
+; BSWAP-NEXT:    retl
 ;
-; CHECK64-LABEL: zext_load_i32_by_i8_bswap:
-; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    movzwl (%rdi), %eax
-; CHECK64-NEXT:    shll $16, %eax
-; CHECK64-NEXT:    bswapl %eax
-; CHECK64-NEXT:    retq
+; MOVBE-LABEL: zext_load_i32_by_i8_bswap:
+; MOVBE:       # %bb.0:
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MOVBE-NEXT:    movbew (%eax), %ax
+; MOVBE-NEXT:    movzwl %ax, %eax
+; MOVBE-NEXT:    retl
+;
+; BSWAP64-LABEL: zext_load_i32_by_i8_bswap:
+; BSWAP64:       # %bb.0:
+; BSWAP64-NEXT:    movzwl (%rdi), %eax
+; BSWAP64-NEXT:    rolw $8, %ax
+; BSWAP64-NEXT:    movzwl %ax, %eax
+; BSWAP64-NEXT:    retq
+;
+; MOVBE64-LABEL: zext_load_i32_by_i8_bswap:
+; MOVBE64:       # %bb.0:
+; MOVBE64-NEXT:    movbew (%rdi), %ax
+; MOVBE64-NEXT:    movzwl %ax, %eax
+; MOVBE64-NEXT:    retq
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
   %tmp2 = load i8, i8* %tmp1, align 1