diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1806,6 +1806,35 @@
   }
   case ISD::BSWAP: {
     SDValue Src = Op.getOperand(0);
+
+    // If the only bits demanded come from one byte of the bswap result,
+    // just shift the input byte into position to eliminate the bswap.
+    unsigned NLZ = DemandedBits.countLeadingZeros();
+    unsigned NTZ = DemandedBits.countTrailingZeros();
+
+    // Round NTZ down to the next byte. If we have 11 trailing zeros, then
+    // we need all the bits down to bit 8. Likewise, round NLZ. If we
+    // have 14 leading zeros, round to 8.
+    NLZ &= ~7;
+    NTZ &= ~7;
+    // If we need exactly one byte, we can do this transformation.
+    if (BitWidth - NLZ - NTZ == 8) {
+      unsigned ResultBit = NTZ;
+      unsigned InputBit = BitWidth - NTZ - 8;
+
+      // Replace this with either a left or right shift to get the byte into
+      // the right place.
+      unsigned ShiftOpcode = InputBit > ResultBit ? ISD::SRL : ISD::SHL;
+      if (!TLO.LegalOperations() || isOperationLegal(ShiftOpcode, VT)) {
+        EVT ShiftAmtTy = getShiftAmountTy(VT, DL);
+        unsigned ShiftAmount =
+            InputBit > ResultBit ? InputBit - ResultBit : ResultBit - InputBit;
+        SDValue ShAmt = TLO.DAG.getConstant(ShiftAmount, dl, ShiftAmtTy);
+        SDValue NewOp = TLO.DAG.getNode(ShiftOpcode, dl, VT, Src, ShAmt);
+        return TLO.CombineTo(Op, NewOp);
+      }
+    }
+
     APInt DemandedSrcBits = DemandedBits.byteSwap();
     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
                              Depth + 1))
diff --git a/llvm/test/CodeGen/AArch64/bswap-known-bits.ll b/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
--- a/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
+++ b/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
@@ -66,7 +66,7 @@
 define i32 @demand_one_byte1(i32 %x) {
 ; CHECK-LABEL: demand_one_byte1:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: rev w8, w0
+; CHECK-NEXT: lsr w8, w0, #8
 ; CHECK-NEXT: and w0, w8, #0xff00
 ; CHECK-NEXT: ret
   %b = call i32 @llvm.bswap.i32(i32 %x)
@@ -77,7 +77,7 @@
 define i32 @demand_one_byte2(i32 %x) {
 ; CHECK-LABEL: demand_one_byte2:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: rev w8, w0
+; CHECK-NEXT: lsl w8, w0, #8
 ; CHECK-NEXT: orr w0, w8, #0xff00ffff
 ; CHECK-NEXT: ret
   %b = call i32 @llvm.bswap.i32(i32 %x)
@@ -88,8 +88,7 @@
 define i64 @demand_one_byte3(i64 %x) {
 ; CHECK-LABEL: demand_one_byte3:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: rev x8, x0
-; CHECK-NEXT: lsr x0, x8, #56
+; CHECK-NEXT: and x0, x0, #0xff
 ; CHECK-NEXT: ret
   %b = call i64 @llvm.bswap.i64(i64 %x)
   %r = lshr i64 %b, 56
@@ -99,9 +98,7 @@
 define void @demand_one_loaded_byte(i64* %xp, i32* %yp) {
 ; CHECK-LABEL: demand_one_loaded_byte:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: lsr x8, x8, #8
-; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: ldrb w8, [x0, #4]
 ; CHECK-NEXT: strb w8, [x1]
 ; CHECK-NEXT: ret
   %x = load i64, i64* %xp, align 8
diff --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll
--- a/llvm/test/CodeGen/X86/combine-bswap.ll
+++ b/llvm/test/CodeGen/X86/combine-bswap.ll
@@ -62,18 +62,13 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %ecx
-; X86-NEXT: shldl $24, %edx, %ecx
-; X86-NEXT: bswapl %ecx
+; X86-NEXT: movb 4(%ecx), %cl
 ; X86-NEXT: movb %cl, (%eax)
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: demand_one_loaded_byte:
 ; X64: # %bb.0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: shrq $8, %rax
-; X64-NEXT: bswapl %eax
+; X64-NEXT: movb 4(%rdi), %al
 ; X64-NEXT: movb %al, (%rsi)
 ; X64-NEXT: retq
   %x = load i64, i64* %xp, align 8
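
For illustration only, below is a minimal standalone C++ sketch of the byte-selection arithmetic used in the TargetLowering.cpp hunk above. It is not LLVM code: it assumes a fixed 32-bit width, uses the GCC/Clang __builtin_clz/__builtin_ctz builtins in place of APInt's zero counts, and the helper name pickShiftForBswap is purely hypothetical.

// Standalone sketch; pickShiftForBswap is a hypothetical helper, not an LLVM
// API. It mirrors the patch's DemandedBits logic for a 32-bit bswap.
#include <cstdint>
#include <cstdio>

// If DemandedBits covers exactly one byte of bswap(x), report the single
// shift of x that produces the demanded byte: a positive amount means a
// logical right shift (ISD::SRL), a negative amount a left shift (ISD::SHL).
static bool pickShiftForBswap(uint32_t DemandedBits, int &ShiftAmount) {
  const unsigned BitWidth = 32;
  if (DemandedBits == 0)
    return false;
  // Round the leading/trailing zero counts down to a byte boundary, as the
  // patch does with NLZ &= ~7 and NTZ &= ~7.
  unsigned NLZ = __builtin_clz(DemandedBits) & ~7u;
  unsigned NTZ = __builtin_ctz(DemandedBits) & ~7u;
  if (BitWidth - NLZ - NTZ != 8)
    return false; // more than one byte is demanded; keep the bswap
  unsigned ResultBit = NTZ;               // where the byte must end up
  unsigned InputBit = BitWidth - NTZ - 8; // where bswap reads it from
  ShiftAmount = (int)InputBit - (int)ResultBit;
  return true;
}

int main() {
  int Amt = 0;
  // and (bswap x), 0xff00 -> x lshr 8 (matches demand_one_byte1: lsr #8).
  if (pickShiftForBswap(0x0000ff00u, Amt))
    std::printf("0x0000ff00 -> shift %d\n", Amt); // prints 8
  // Demanding bits 16..23 -> x shl 8 (matches demand_one_byte2: lsl #8).
  if (pickShiftForBswap(0x00ff0000u, Amt))
    std::printf("0x00ff0000 -> shift %d\n", Amt); // prints -8
  // Two bytes demanded: no single shift can replace the bswap.
  if (!pickShiftForBswap(0x00ffff00u, Amt))
    std::printf("0x00ffff00 -> keep bswap\n");
  return 0;
}

A positive amount corresponds to ISD::SRL and a negative one to ISD::SHL; the two masks mirror the demand_one_byte1/demand_one_byte2 tests above, where the rev is replaced by lsr #8 and lsl #8 respectively.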