diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9588,6 +9588,26 @@
     return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap);
   }
 
+  // fold (bswap shl(x,c)) -> (zext(bswap(trunc(shl(x,sub(c,bw/2))))))
+  // iff x >= bw/2 (i.e. lower half is known zero)
+  unsigned BW = VT.getScalarSizeInBits();
+  if (BW >= 32 && N0.getOpcode() == ISD::SHL && N0.hasOneUse()) {
+    auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), BW / 2);
+    if (ShAmt && ShAmt->getAPIntValue().ult(BW) &&
+        ShAmt->getZExtValue() >= (BW / 2) &&
+        (ShAmt->getZExtValue() % 16) == 0 && TLI.isTruncateFree(VT, HalfVT) &&
+        (!LegalOperations || hasOperation(ISD::BSWAP, HalfVT))) {
+      SDValue Res = N0.getOperand(0);
+      if (uint64_t NewShAmt = (ShAmt->getZExtValue() - (BW / 2)))
+        Res = DAG.getNode(ISD::SHL, DL, VT, Res,
+                          DAG.getConstant(NewShAmt, DL, getShiftAmountTy(VT)));
+      Res = DAG.getZExtOrTrunc(Res, DL, HalfVT);
+      Res = DAG.getNode(ISD::BSWAP, DL, HalfVT, Res);
+      return DAG.getZExtOrTrunc(Res, DL, VT);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
--- a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll
@@ -442,8 +442,8 @@
 ; CHECK-LABEL: zext_load_i32_by_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    rev w0, w8
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    lsr w0, w8, #16
 ; CHECK-NEXT:    ret
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll
--- a/llvm/test/CodeGen/AArch64/load-combine.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine.ll
@@ -499,8 +499,8 @@
 ; CHECK-LABEL: zext_load_i32_by_i8_bswap:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    rev w0, w8
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    lsr w0, w8, #16
 ; CHECK-NEXT:    ret
   %tmp = bitcast i32* %arg to i8*
diff --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll
--- a/llvm/test/CodeGen/X86/combine-bswap.ll
+++ b/llvm/test/CodeGen/X86/combine-bswap.ll
@@ -87,17 +87,16 @@
 define i64 @test_bswap64_shift48_zext(i16 %a0) {
 ; X86-LABEL: test_bswap64_shift48_zext:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bswap64_shift48_zext:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shlq $48, %rax
-; X64-NEXT:    bswapq %rax
+; X64-NEXT:    rolw $8, %di
+; X64-NEXT:    movzwl %di, %eax
 ; X64-NEXT:    retq
   %z = zext i16 %a0 to i64
   %s = shl i64 %z, 48
@@ -109,16 +108,15 @@
 ; X86-LABEL: test_bswap64_shift48:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    bswapl %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_bswap64_shift48:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlq $48, %rax
-; X64-NEXT:    bswapq %rax
+; X64-NEXT:    rolw $8, %di
+; X64-NEXT:    movzwl %di, %eax
 ; X64-NEXT:    retq
   %s = shl i64 %a0, 48
   %b = call i64 @llvm.bswap.i64(i64 %s)
diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll
--- a/llvm/test/CodeGen/X86/load-combine.ll
+++ b/llvm/test/CodeGen/X86/load-combine.ll
@@ -1209,20 +1209,33 @@
 ; i8* p;
 ; (i32) p[1] | ((i32) p[0] << 8)
 define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
-; CHECK-LABEL: zext_load_i32_by_i8_bswap:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movzwl (%eax), %eax
-; CHECK-NEXT:    shll $16, %eax
-; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    retl
+; BSWAP-LABEL: zext_load_i32_by_i8_bswap:
+; BSWAP:       # %bb.0:
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BSWAP-NEXT:    movzwl (%eax), %eax
+; BSWAP-NEXT:    rolw $8, %ax
+; BSWAP-NEXT:    movzwl %ax, %eax
+; BSWAP-NEXT:    retl
 ;
-; CHECK64-LABEL: zext_load_i32_by_i8_bswap:
-; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    movzwl (%rdi), %eax
-; CHECK64-NEXT:    shll $16, %eax
-; CHECK64-NEXT:    bswapl %eax
-; CHECK64-NEXT:    retq
+; MOVBE-LABEL: zext_load_i32_by_i8_bswap:
+; MOVBE:       # %bb.0:
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MOVBE-NEXT:    movbew (%eax), %ax
+; MOVBE-NEXT:    movzwl %ax, %eax
+; MOVBE-NEXT:    retl
+;
+; BSWAP64-LABEL: zext_load_i32_by_i8_bswap:
+; BSWAP64:       # %bb.0:
+; BSWAP64-NEXT:    movzwl (%rdi), %eax
+; BSWAP64-NEXT:    rolw $8, %ax
+; BSWAP64-NEXT:    movzwl %ax, %eax
+; BSWAP64-NEXT:    retq
+;
+; MOVBE64-LABEL: zext_load_i32_by_i8_bswap:
+; MOVBE64:       # %bb.0:
+; MOVBE64-NEXT:    movbew (%rdi), %ax
+; MOVBE64-NEXT:    movzwl %ax, %eax
+; MOVBE64-NEXT:    retq
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
   %tmp2 = load i8, i8* %tmp1, align 1