Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1806,6 +1806,43 @@
   }
   case ISD::BSWAP: {
     SDValue Src = Op.getOperand(0);
+
+    // If the only bits demanded come from one byte of the bswap result,
+    // just shift the input byte into position to eliminate the bswap.
+    unsigned NLZ = DemandedBits.countLeadingZeros();
+    unsigned NTZ = DemandedBits.countTrailingZeros();
+
+    // Round NTZ down to the next byte. If we have 11 trailing zeros, then
+    // we need all the bits down to bit 8. Likewise, round NLZ. If we
+    // have 14 leading zeros, round to 8.
+    NLZ &= ~7;
+    NTZ &= ~7;
+    // If we need exactly one byte, we can do this transformation.
+    if (BitWidth - NLZ - NTZ == 8) {
+      // The demanded result byte lives at bit NTZ; bswap sourced it from the
+      // input byte at bit BitWidth - NTZ - 8.
+      unsigned ResultBit = NTZ;
+      unsigned InputBit = BitWidth - NTZ - 8;
+
+      // Replace this with either a left or right shift to get the byte into
+      // the right place.
+      unsigned ShiftOpcode = InputBit > ResultBit ? ISD::SRL : ISD::SHL;
+      unsigned ShiftAmount =
+          InputBit > ResultBit ? InputBit - ResultBit : ResultBit - InputBit;
+
+      // Once operations have been legalized we must not introduce a shift
+      // that is not legal for VT; keep the bswap in that case.
+      if (!TLO.LegalOperations() || isOperationLegal(ShiftOpcode, VT)) {
+        EVT ShAmtTy = VT;
+        if (TLO.LegalTypes() && !ShAmtTy.isVector())
+          ShAmtTy = getShiftAmountTy(ShAmtTy, DL);
+
+        SDValue ShAmt = TLO.DAG.getConstant(ShiftAmount, dl, ShAmtTy);
+        SDValue NewOp = TLO.DAG.getNode(ShiftOpcode, dl, VT, Src, ShAmt);
+        return TLO.CombineTo(Op, NewOp);
+      }
+    }
+
     APInt DemandedSrcBits = DemandedBits.byteSwap();
     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
                              Depth + 1))
Index: llvm/test/CodeGen/AArch64/bswap-known-bits.ll
===================================================================
--- llvm/test/CodeGen/AArch64/bswap-known-bits.ll
+++ llvm/test/CodeGen/AArch64/bswap-known-bits.ll
@@ -66,7 +66,7 @@
 define i32 @demand_one_byte1(i32 %x) {
 ; CHECK-LABEL: demand_one_byte1:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: rev w8, w0
+; CHECK-NEXT: lsr w8, w0, #8
 ; CHECK-NEXT: and w0, w8, #0xff00
 ; CHECK-NEXT: ret
   %b = call i32 @llvm.bswap.i32(i32 %x)
@@ -77,7 +77,7 @@
 define i32 @demand_one_byte2(i32 %x) {
 ; CHECK-LABEL: demand_one_byte2:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: rev w8, w0
+; CHECK-NEXT: lsl w8, w0, #8
 ; CHECK-NEXT: orr w0, w8, #0xff00ffff
 ; CHECK-NEXT: ret
   %b = call i32 @llvm.bswap.i32(i32 %x)
@@ -88,8 +88,7 @@
 define i64 @demand_one_byte3(i64 %x) {
 ; CHECK-LABEL: demand_one_byte3:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: rev x8, x0
-; CHECK-NEXT: lsr x0, x8, #56
+; CHECK-NEXT: and x0, x0, #0xff
 ; CHECK-NEXT: ret
   %b = call i64 @llvm.bswap.i64(i64 %x)
   %r = lshr i64 %b, 56
@@ -99,9 +98,7 @@
 define void @demand_one_loaded_byte(i64* %xp, i32* %yp) {
 ; CHECK-LABEL: demand_one_loaded_byte:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: lsr x8, x8, #8
-; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: ldrb w8, [x0, #4]
 ; CHECK-NEXT: strb w8, [x1]
 ; CHECK-NEXT: ret
   %x = load i64, i64* %xp, align 8
Index: llvm/test/CodeGen/X86/combine-bswap.ll
===================================================================
--- llvm/test/CodeGen/X86/combine-bswap.ll
+++ llvm/test/CodeGen/X86/combine-bswap.ll
@@ -62,18 +62,13 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %ecx
-; X86-NEXT: shldl $24, %edx, %ecx
-; X86-NEXT: bswapl %ecx
+; X86-NEXT: movb 4(%ecx), %cl
 ; X86-NEXT: movb %cl, (%eax)
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: demand_one_loaded_byte:
 ; X64: # %bb.0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: shrq $8, %rax
-; X64-NEXT: bswapl %eax
+; X64-NEXT: movb 4(%rdi), %al
 ; X64-NEXT: movb %al, (%rsi)
 ; X64-NEXT: retq
   %x = load i64, i64* %xp, align 8