diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10753,6 +10753,21 @@
     }
   }
 
+  // fold (bswap (logic_op(bswap(x), y))) -> logic_op(x, bswap(y))
+  // Both the logic_op and the inner bswap must have no other uses.
+  if (ISD::isBitwiseLogicOp(N0.getOpcode()) && N0.hasOneUse()) {
+    SDValue OldLHS = N0.getOperand(0);
+    SDValue OldRHS = N0.getOperand(1);
+
+    if (OldLHS.getOpcode() == ISD::BSWAP && OldLHS.hasOneUse()) {
+      SDValue NewSwap = DAG.getNode(ISD::BSWAP, DL, VT, OldRHS);
+      return DAG.getNode(N0.getOpcode(), DL, VT, OldLHS.getOperand(0), NewSwap);
+    } else if (OldRHS.getOpcode() == ISD::BSWAP && OldRHS.hasOneUse()) {
+      SDValue NewSwap = DAG.getNode(ISD::BSWAP, DL, VT, OldLHS);
+      return DAG.getNode(N0.getOpcode(), DL, VT, OldRHS.getOperand(0), NewSwap);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -67,7 +67,7 @@
 
   if (match(OldRHS, m_BSwap(m_Value(NewRHS)))) {
     // OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
-    if (!OldLHS->hasOneUse() && !OldRHS->hasOneUse())
+    if (!OldLHS->hasOneUse() || !OldRHS->hasOneUse())
       return nullptr;
     // NewRHS initialized by the matcher.
   } else if (match(OldRHS, m_APInt(C))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1625,6 +1625,14 @@
   case Intrinsic::bswap: {
     Value *IIOperand = II->getArgOperand(0);
 
+    // A back-to-back bswap is unlikely to appear in real-world code, but it
+    // can be introduced by other folds, e.g.
+    //   bswap(logic_op(bswap(x), bswap(y)))
+    //     --> bswap(bswap(logic_op(x, y)))
+    if (Value *V; match(IIOperand, m_BSwap(m_Value(V)))) {
+      return replaceInstUsesWith(CI, V);
+    }
+
     // Try to canonicalize bswap-of-logical-shift-by-8-bit-multiple as
     // inverse-shift-of-bswap:
     // bswap (shl X, Y) --> lshr (bswap X), Y
@@ -1669,6 +1677,25 @@
       Value *V = Builder.CreateLShr(X, CV);
       return new TruncInst(V, IIOperand->getType());
     }
+
+    // bswap(logic_op(bswap(x), y)) --> logic_op(x, bswap(y))
+    if (match(IIOperand, m_OneUse(m_BitwiseLogic(m_Value(X), m_Value(Y))))) {
+      Value *OldSwap;
+      BinaryOperator::BinaryOps Op =
+          cast<BinaryOperator>(IIOperand)->getOpcode();
+
+      // At least one operand has to be a bswap, otherwise the total number of
+      // bswaps is not reduced.
+      // The operand whose bswap is being removed must also have no other
+      // uses.
+      if (match(X, m_BSwap(m_Value(OldSwap))) && X->hasOneUse()) {
+        Value *NewSwap = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, Y);
+        return BinaryOperator::Create(Op, OldSwap, NewSwap);
+      } else if (match(Y, m_BSwap(m_Value(OldSwap))) && Y->hasOneUse()) {
+        Value *NewSwap = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, X);
+        return BinaryOperator::Create(Op, NewSwap, OldSwap);
+      }
+    }
     break;
   }
   case Intrinsic::masked_load:
diff --git a/llvm/test/Transforms/InstCombine/bswap-fold.ll b/llvm/test/Transforms/InstCombine/bswap-fold.ll
--- a/llvm/test/Transforms/InstCombine/bswap-fold.ll
+++ b/llvm/test/Transforms/InstCombine/bswap-fold.ll
@@ -498,8 +498,8 @@
 define i64 @bs_and64_multiuse2(i64 %a, i64 %b) #0 {
 ; CHECK-LABEL: @bs_and64_multiuse2(
 ; CHECK-NEXT:    [[T1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[A:%.*]])
-; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[A]], [[B:%.*]]
-; CHECK-NEXT:    [[T3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[T2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]])
+; CHECK-NEXT:    [[T3:%.*]] = and i64 [[T1]], [[T2]]
 ; CHECK-NEXT:    [[T4:%.*]] = mul i64 [[T3]], [[T1]]
 ; CHECK-NEXT:    ret i64 [[T4]]
 ;
@@ -512,9 +512,9 @@
 define i64 @bs_and64_multiuse3(i64 %a, i64 %b) #0 {
 ; CHECK-LABEL: @bs_and64_multiuse3(
+; CHECK-NEXT:    [[T1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[A:%.*]])
 ; CHECK-NEXT:    [[T2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]])
-; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[A:%.*]], [[B]]
-; CHECK-NEXT:    [[T3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[T3:%.*]] = and i64 [[T1]], [[T2]]
 ; CHECK-NEXT:    [[T4:%.*]] = mul i64 [[T3]], [[T2]]
 ; CHECK-NEXT:    ret i64 [[T4]]
 ;
@@ -539,6 +539,231 @@
 }
+
+; Issue#62236
+; Fold: BSWAP( OP( BSWAP(x), y ) ) -> OP( x, BSWAP(y) )
+define i16 @bs_and_lhs_bs16(i16 %a, i16 %b) #0 {
+; CHECK-LABEL: @bs_and_lhs_bs16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[B:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %1 = tail call i16 @llvm.bswap.i16(i16 %a)
+  %2 = and i16 %1, %b
+  %3 = tail call i16 @llvm.bswap.i16(i16 %2)
+  ret i16 %3
+}
+
+define i16 @bs_or_lhs_bs16(i16 %a, i16 %b) #0 {
+; CHECK-LABEL: @bs_or_lhs_bs16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[B:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = or i16 [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %1 = tail call i16 @llvm.bswap.i16(i16 %a)
+  %2 = or i16 %1, %b
+  %3 = tail call i16 @llvm.bswap.i16(i16 %2)
+  ret i16 %3
+}
+
+define i16 @bs_xor_lhs_bs16(i16 %a, i16 %b) #0 {
+; CHECK-LABEL: @bs_xor_lhs_bs16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[B:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i16 [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %1 = tail call i16 @llvm.bswap.i16(i16 %a)
+  %2 = xor i16 %1, %b
+  %3 = tail call i16 @llvm.bswap.i16(i16 %2)
+  ret i16 %3
+}
+
+define i16 @bs_and_rhs_bs16(i16 %a, i16 %b) #0 {
+; CHECK-LABEL: @bs_and_rhs_bs16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %1 = tail call i16 @llvm.bswap.i16(i16 %b)
+  %2 = and i16 %1, %a
+  %3 = tail call i16 @llvm.bswap.i16(i16 %2)
+  ret i16 %3
+}
+
+define i16 @bs_or_rhs_bs16(i16 %a, i16 %b) #0 {
+; CHECK-LABEL: @bs_or_rhs_bs16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = or i16 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %1 = tail call i16 @llvm.bswap.i16(i16 %b)
+  %2 = or i16 %1, %a
+  %3 = tail call i16 @llvm.bswap.i16(i16 %2)
+  ret i16 %3
+}
+
+define i16 @bs_xor_rhs_bs16(i16 %a, i16 %b) #0 {
+; CHECK-LABEL: @bs_xor_rhs_bs16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i16 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %1 = tail call i16 @llvm.bswap.i16(i16 %b)
+  %2 = xor i16 %1, %a
+  %3 = tail call i16 @llvm.bswap.i16(i16 %2)
+  ret i16 %3
+}
+
+define i32 @bs_and_rhs_bs32(i32 %a, i32 %b) #0 {
+; CHECK-LABEL: @bs_and_rhs_bs32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = tail call i32 @llvm.bswap.i32(i32 %b)
+  %2 = and i32 %1, %a
+  %3 = tail call i32 @llvm.bswap.i32(i32 %2)
+  ret i32 %3
+}
+
+define i32 @bs_or_rhs_bs32(i32 %a, i32 %b) #0 {
+; CHECK-LABEL: @bs_or_rhs_bs32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = tail call i32 @llvm.bswap.i32(i32 %b)
+  %2 = or i32 %1, %a
+  %3 = tail call i32 @llvm.bswap.i32(i32 %2)
+  ret i32 %3
+}
+
+define i32 @bs_xor_rhs_bs32(i32 %a, i32 %b) #0 {
+; CHECK-LABEL: @bs_xor_rhs_bs32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = tail call i32 @llvm.bswap.i32(i32 %b)
+  %2 = xor i32 %1, %a
+  %3 = tail call i32 @llvm.bswap.i32(i32 %2)
+  ret i32 %3
+}
+
+define i64 @bs_and_rhs_bs64(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and_rhs_bs64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %1 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %2 = and i64 %1, %a
+  %3 = tail call i64 @llvm.bswap.i64(i64 %2)
+  ret i64 %3
+}
+
+define i64 @bs_or_rhs_bs64(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_or_rhs_bs64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %1 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %2 = or i64 %1, %a
+  %3 = tail call i64 @llvm.bswap.i64(i64 %2)
+  ret i64 %3
+}
+
+define i64 @bs_xor_rhs_bs64(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_xor_rhs_bs64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %1 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %2 = xor i64 %1, %a
+  %3 = tail call i64 @llvm.bswap.i64(i64 %2)
+  ret i64 %3
+}
+
+define <2 x i32> @bs_and_rhs_i32vec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_and_rhs_i32vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i32> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[TMP2]]
+;
+  %1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b)
+  %2 = and <2 x i32> %1, %a
+  %3 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %2)
+  ret <2 x i32> %3
+}
+
+define <2 x i32> @bs_or_rhs_i32vec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_or_rhs_i32vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i32> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[TMP2]]
+;
+  %1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b)
+  %2 = or <2 x i32> %1, %a
+  %3 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %2)
+  ret <2 x i32> %3
+}
+
+define <2 x i32> @bs_xor_rhs_i32vec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_xor_rhs_i32vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[TMP2]]
+;
+  %1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b)
+  %2 = xor <2 x i32> %1, %a
+  %3 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %2)
+  ret <2 x i32> %3
+}
+
+define i64 @bs_and_rhs_bs64_multiuse1(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and_rhs_bs64_multiuse1(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+  %1 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %2 = and i64 %1, %a
+  %3 = tail call i64 @llvm.bswap.i64(i64 %2)
+  %4 = mul i64 %2, %3 ; extra use of the logic op
+  ret i64 %4
+}
+
+define i64 @bs_and_rhs_bs64_multiuse2(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and_rhs_bs64_multiuse2(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+  %1 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %2 = and i64 %1, %a
+  %3 = tail call i64 @llvm.bswap.i64(i64 %2)
+  %4 = mul i64 %1, %3 ; extra use of the inner bswap
+  ret i64 %4
+}
+
+define i64 @bs_all_operand64(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_all_operand64(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
+  %1 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %2 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %3 = and i64 %1, %2
+  %4 = tail call i64 @llvm.bswap.i64(i64 %3)
+  ret i64 %4
+}
+
 define i64 @bs_active_high8(i64 %0) {
 ; CHECK-LABEL: @bs_active_high8(
 ; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP0:%.*]], 255
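
Note (illustrative, not part of the diff): a minimal standalone reproducer for the new InstCombine fold. The function and value names below are hypothetical; with this patch applied, running the function through opt -passes=instcombine is expected to rewrite bswap(and(bswap(%a), %b)) into the equivalent of and(%a, bswap(%b)), leaving a single bswap.

; Minimal sketch, assuming the InstCombineCalls.cpp fold above.
define i32 @bswap_logic_example(i32 %a, i32 %b) {
  %swapped_a = call i32 @llvm.bswap.i32(i32 %a)  ; inner bswap, single use
  %masked = and i32 %swapped_a, %b               ; single-use bitwise logic op
  %res = call i32 @llvm.bswap.i32(i32 %masked)   ; outer bswap triggers the fold
  ret i32 %res
}

declare i32 @llvm.bswap.i32(i32)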