Index: llvm/lib/Analysis/InstructionSimplify.cpp =================================================================== --- llvm/lib/Analysis/InstructionSimplify.cpp +++ llvm/lib/Analysis/InstructionSimplify.cpp @@ -6026,6 +6026,10 @@ break; case Intrinsic::bswap: // bswap(bswap(x)) -> x + // The duplicate bswap might not come from real-world code, but can be + // introduced by other optimizations, + // e.g. bswap(logic_op(bswap(x), bswap(y))) + // --> bswap(bswap(logic_op(x, y))) if (match(Op0, m_BSwap(m_Value(X)))) return X; break; Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9968,6 +9968,35 @@ : DAG.getZExtOrTrunc(Result, DL, WideVT)); } +// fold (bswap (logic_op(bswap(x),y))) -> logic_op(x,bswap(y)) +// This helper function accepts ISD::BSWAP and ISD::BITREVERSE in the Opcode +// parameter +static SDValue foldBitOrderCrossLogicOp(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode != ISD::BSWAP && Opcode != ISD::BITREVERSE) + return SDValue(); + + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + SDLoc DL(N); + if (ISD::isBitwiseLogicOp(N0.getOpcode()) && N0.hasOneUse()) { + SDValue OldLHS = N0.getOperand(0); + SDValue OldRHS = N0.getOperand(1); + + // Need to ensure logic_op and bswap/bitreverse(x) don't have other uses + if (OldLHS.getOpcode() == Opcode && OldLHS.hasOneUse()) { + SDValue NewBitReorder = DAG.getNode(Opcode, DL, VT, OldRHS); + return DAG.getNode(N0.getOpcode(), DL, VT, OldLHS.getOperand(0), + NewBitReorder); + } else if (OldRHS.getOpcode() == Opcode && OldRHS.hasOneUse()) { + SDValue NewBitReorder = DAG.getNode(Opcode, DL, VT, OldLHS); + return DAG.getNode(N0.getOpcode(), DL, VT, OldRHS.getOperand(0), + NewBitReorder); + } + } + return SDValue(); +} + SDValue DAGCombiner::visitSRA(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue
N1 = N->getOperand(1); @@ -10753,6 +10782,9 @@ } } + if (SDValue V = foldBitOrderCrossLogicOp(N, DAG)) + return V; + return SDValue(); } Index: llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1266,6 +1266,37 @@ return new ShuffleVectorInst(NewIntrinsic, Mask); } +/// Fold two bit reordering intrinsics (bswap/bitreverse) with f(f(x)) = x +/// crossing a logical op, into a single instruction in the operand of the +/// logical op. The value V is given as the operand of the outer +/// bswap/bitreverse intrinsic. +/// e.g. bswap(logic_op(bswap(x), y)) --> logic_op(x, bswap(y)) +template <Intrinsic::ID IntrID> +static Instruction *foldBitOrderCrossLogicOp(Value *V, InstCombiner::BuilderTy &Builder) { + if (IntrID != Intrinsic::bswap && IntrID != Intrinsic::bitreverse) + return nullptr; + + Value *X, *Y; + if (match(V, m_OneUse(m_BitwiseLogic(m_Value(X), m_Value(Y))))) { + Value *OldReorder; + BinaryOperator::BinaryOps Op = cast<BinaryOperator>(V)->getOpcode(); + + // At least one side needs to be bswap/bitreverse so that the number of + // bswap can be reduced. + // We also need to ensure the operands whose bswap/bitreverse is being + // removed have no other uses. + if (match(X, m_Intrinsic<IntrID>(m_Value(OldReorder))) && X->hasOneUse()) { + Value *NewReorder = Builder.CreateUnaryIntrinsic(IntrID, Y); + return BinaryOperator::Create(Op, OldReorder, NewReorder); + } else if (match(Y, m_Intrinsic<IntrID>(m_Value(OldReorder))) && + Y->hasOneUse()) { + Value *NewReorder = Builder.CreateUnaryIntrinsic(IntrID, X); + return BinaryOperator::Create(Op, NewReorder, OldReorder); + } + } + return nullptr; +} + /// CallInst simplification. This mostly only handles folding of intrinsic /// instructions. For normal calls, it allows visitCallBase to do the heavy /// lifting.
@@ -1669,6 +1700,12 @@ Value *V = Builder.CreateLShr(X, CV); return new TruncInst(V, IIOperand->getType()); } + + if (Instruction *crossLogicOpFold = + foldBitOrderCrossLogicOp<Intrinsic::bswap>(IIOperand, Builder)) { + return crossLogicOpFold; + } + break; } case Intrinsic::masked_load: Index: llvm/test/CodeGen/X86/combine-bswap.ll =================================================================== --- llvm/test/CodeGen/X86/combine-bswap.ll +++ llvm/test/CodeGen/X86/combine-bswap.ll @@ -261,15 +261,13 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: bswapl %eax ; X86-NEXT: retl ; ; X64-LABEL: bs_and_lhs_bs32: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: bswapl %eax -; X64-NEXT: andl %esi, %eax +; X64-NEXT: movl %esi, %eax ; X64-NEXT: bswapl %eax +; X64-NEXT: andl %edi, %eax ; X64-NEXT: retq %1 = tail call i32 @llvm.bswap.i32(i32 %a) %2 = and i32 %1, %b @@ -280,22 +278,19 @@ define i64 @bs_and_lhs_bs64(i64 %a, i64 %b) #0 { ; X86-LABEL: bs_and_lhs_bs64: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax -; X86-NEXT: bswapl %edx -; X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %edx +; X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl ; ; X64-LABEL: bs_and_lhs_bs64: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: bswapq %rax -; X64-NEXT: andq %rsi, %rax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: bswapq %rax +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq %1 = tail call i64 @llvm.bswap.i64(i64 %a) %2 = and i64 %1, %b @@ -306,22 +301,19 @@ define i64 @bs_and_rhs_bs64(i64 %a, i64 %b) #0 { ; X86-LABEL: bs_and_rhs_bs64: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax -; X86-NEXT: bswapl %edx -;
X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %edx +; X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl ; ; X64-LABEL: bs_and_rhs_bs64: ; X64: # %bb.0: -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: bswapq %rax -; X64-NEXT: andq %rdi, %rax +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: bswapq %rax +; X64-NEXT: andq %rsi, %rax ; X64-NEXT: retq %1 = tail call i64 @llvm.bswap.i64(i64 %b) %2 = and i64 %1, %a Index: llvm/test/Transforms/InstCombine/bswap-fold.ll =================================================================== --- llvm/test/Transforms/InstCombine/bswap-fold.ll +++ llvm/test/Transforms/InstCombine/bswap-fold.ll @@ -543,10 +543,9 @@ ; Fold: BSWAP( OP( BSWAP(x), y ) ) -> OP( x, BSWAP(y) ) define i16 @bs_and_lhs_bs16(i16 %a, i16 %b) #0 { ; CHECK-LABEL: @bs_and_lhs_bs16( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[A:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], [[B:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]]) -; CHECK-NEXT: ret i16 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[B:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: ret i16 [[TMP2]] ; %1 = tail call i16 @llvm.bswap.i16(i16 %a) %2 = and i16 %1, %b @@ -556,10 +555,9 @@ define i16 @bs_or_lhs_bs16(i16 %a, i16 %b) #0 { ; CHECK-LABEL: @bs_or_lhs_bs16( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[A:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = or i16 [[TMP1]], [[B:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]]) -; CHECK-NEXT: ret i16 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[B:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = or i16 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: ret i16 [[TMP2]] ; %1 = tail call i16 @llvm.bswap.i16(i16 %a) %2 = or i16 %1, %b @@ -569,10 +567,9 @@ define i16 @bs_xor_lhs_bs16(i16 %a, i16 %b) #0 { ; CHECK-LABEL: @bs_xor_lhs_bs16( -; 
CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[A:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = xor i16 [[TMP1]], [[B:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]]) -; CHECK-NEXT: ret i16 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[B:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = xor i16 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: ret i16 [[TMP2]] ; %1 = tail call i16 @llvm.bswap.i16(i16 %a) %2 = xor i16 %1, %b @@ -582,10 +579,9 @@ define i16 @bs_and_rhs_bs16(i16 %a, i16 %b) #0 { ; CHECK-LABEL: @bs_and_rhs_bs16( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]]) -; CHECK-NEXT: ret i16 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i16 [[TMP2]] ; %1 = tail call i16 @llvm.bswap.i16(i16 %b) %2 = and i16 %1, %a @@ -595,10 +591,9 @@ define i16 @bs_or_rhs_bs16(i16 %a, i16 %b) #0 { ; CHECK-LABEL: @bs_or_rhs_bs16( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = or i16 [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]]) -; CHECK-NEXT: ret i16 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = or i16 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i16 [[TMP2]] ; %1 = tail call i16 @llvm.bswap.i16(i16 %b) %2 = or i16 %1, %a @@ -608,10 +603,9 @@ define i16 @bs_xor_rhs_bs16(i16 %a, i16 %b) #0 { ; CHECK-LABEL: @bs_xor_rhs_bs16( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = xor i16 [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]]) -; CHECK-NEXT: ret i16 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 
[[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = xor i16 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i16 [[TMP2]] ; %1 = tail call i16 @llvm.bswap.i16(i16 %b) %2 = xor i16 %1, %a @@ -621,10 +615,9 @@ define i32 @bs_and_rhs_bs32(i32 %a, i32 %b) #0 { ; CHECK-LABEL: @bs_and_rhs_bs32( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; CHECK-NEXT: ret i32 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[TMP2]] ; %1 = tail call i32 @llvm.bswap.i32(i32 %b) %2 = and i32 %1, %a @@ -634,10 +627,9 @@ define i32 @bs_or_rhs_bs32(i32 %a, i32 %b) #0 { ; CHECK-LABEL: @bs_or_rhs_bs32( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; CHECK-NEXT: ret i32 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[TMP2]] ; %1 = tail call i32 @llvm.bswap.i32(i32 %b) %2 = or i32 %1, %a @@ -647,10 +639,9 @@ define i32 @bs_xor_rhs_bs32(i32 %a, i32 %b) #0 { ; CHECK-LABEL: @bs_xor_rhs_bs32( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; CHECK-NEXT: ret i32 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[TMP2]] ; %1 = tail call i32 @llvm.bswap.i32(i32 %b) %2 = xor i32 %1, %a @@ -660,10 +651,9 @@ define i64 @bs_and_rhs_bs64(i64 %a, i64 %b) #0 { ; CHECK-LABEL: @bs_and_rhs_bs64( -; CHECK-NEXT: [[TMP1:%.*]] = tail 
call i64 @llvm.bswap.i64(i64 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; CHECK-NEXT: ret i64 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i64 [[TMP2]] ; %1 = tail call i64 @llvm.bswap.i64(i64 %b) %2 = and i64 %1, %a @@ -673,10 +663,9 @@ define i64 @bs_or_rhs_bs64(i64 %a, i64 %b) #0 { ; CHECK-LABEL: @bs_or_rhs_bs64( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; CHECK-NEXT: ret i64 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i64 [[TMP2]] ; %1 = tail call i64 @llvm.bswap.i64(i64 %b) %2 = or i64 %1, %a @@ -686,10 +675,9 @@ define i64 @bs_xor_rhs_bs64(i64 %a, i64 %b) #0 { ; CHECK-LABEL: @bs_xor_rhs_bs64( -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; CHECK-NEXT: ret i64 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret i64 [[TMP2]] ; %1 = tail call i64 @llvm.bswap.i64(i64 %b) %2 = xor i64 %1, %a @@ -699,10 +687,9 @@ define <2 x i32> @bs_and_rhs_i32vec(<2 x i32> %a, <2 x i32> %b) #0 { ; CHECK-LABEL: @bs_and_rhs_i32vec( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP2]]) -; CHECK-NEXT: ret <2 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 
x i32> @llvm.bswap.v2i32(<2 x i32> [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret <2 x i32> [[TMP2]] ; %1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b) %2 = and <2 x i32> %1, %a @@ -712,10 +699,9 @@ define <2 x i32> @bs_or_rhs_i32vec(<2 x i32> %a, <2 x i32> %b) #0 { ; CHECK-LABEL: @bs_or_rhs_i32vec( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP2]]) -; CHECK-NEXT: ret <2 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret <2 x i32> [[TMP2]] ; %1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b) %2 = or <2 x i32> %1, %a @@ -725,10 +711,9 @@ define <2 x i32> @bs_xor_rhs_i32vec(<2 x i32> %a, <2 x i32> %b) #0 { ; CHECK-LABEL: @bs_xor_rhs_i32vec( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[TMP1]], [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP2]]) -; CHECK-NEXT: ret <2 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[A:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: ret <2 x i32> [[TMP2]] ; %1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b) %2 = xor <2 x i32> %1, %a