diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4613,6 +4613,8 @@
                            const SDLoc &dl, SDValue &Chain,
                            bool IsSignaling = false) const;
 
+  SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) const;
+
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
   //
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6582,6 +6582,55 @@
   return true;
 }
 
+static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
+                                   SDValue And1_L, SDValue And1_R,
+                                   SDLoc DL, SelectionDAG &DAG) {
+  if (!isBitwiseNot(And0_L, true))
+    return SDValue();
+  SDValue NotOp = And0_L->getOperand(0);
+  if (NotOp == And1_R) {
+    std::swap(And1_R, And1_L);
+  }
+  // (~(NotOp) & And0_R) | (NotOp & And1_R)
+  // --> ((And0_R ^ And1_R) & NotOp) ^ And0_R
+  if (NotOp == And1_L) {
+    EVT VT = And1_L->getValueType(0);
+    SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
+    SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
+    SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
+    SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
+    return Xor1;
+  }
+  return SDValue();
+}
+
+SDValue TargetLowering::foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) const {
+  // Note that masked-merge variants using XOR or ADD expressions are
+  // normalized to OR by InstCombine, so we only check for OR.
+  assert(Node->getOpcode() == ISD::OR);
+  SDValue N0 = Node->getOperand(0);
+  if (N0->getOpcode() != ISD::AND)
+    return SDValue();
+  SDValue N1 = Node->getOperand(1);
+  if (N1->getOpcode() != ISD::AND)
+    return SDValue();
+
+  SDLoc DL(Node);
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  SDValue N10 = N1->getOperand(0);
+  SDValue N11 = N1->getOperand(1);
+  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
+    return Result;
+  return SDValue();
+}
+
 // TODO: Merge with expandFunnelShift.
 bool TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps,
                                SDValue &Result, SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1478,6 +1478,10 @@
       LegalFPImmediates.push_back(Imm);
     }
 
+    SDValue combineOr(SDNode *N, SelectionDAG &DAG,
+                      DAGCombinerInfo &DCI,
+                      const X86Subtarget &Subtarget) const;
+
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46382,9 +46382,9 @@
   return Ret;
 }
 
-static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
-                         TargetLowering::DAGCombinerInfo &DCI,
-                         const X86Subtarget &Subtarget) {
+SDValue X86TargetLowering::combineOr(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const X86Subtarget &Subtarget) const {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
@@ -46475,6 +46475,13 @@
     return Res;
   }
 
+  // When `andn` is unavailable, transform:
+  // (x & m) | (y & ~m) --> ((x ^ y) & m) ^ y
+  if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1) {
+    if (SDValue R = foldMaskedMerge(N, DAG))
+      return R;
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -o - %s -mtriple=x86_64-- | FileCheck %s
+; RUN: llc -o - %s -mtriple=x86_64-- -mattr=+bmi | FileCheck %s --check-prefixes=BMI
+;
+; Test that masked-merge code is generated as an "xor; and; xor" sequence,
+; or as "andn; and; or" when and-not is available.
+
+define i32 @masked_merge(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-LABEL: masked_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    retq
+;
+; BMI-LABEL: masked_merge:
+; BMI:       # %bb.0:
+; BMI-NEXT:    andl %edi, %esi
+; BMI-NEXT:    andnl %edx, %edi, %eax
+; BMI-NEXT:    orl %esi, %eax
+; BMI-NEXT:    retq
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @masked_merge1(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-LABEL: masked_merge1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    retq
+;
+; BMI-LABEL: masked_merge1:
+; BMI:       # %bb.0:
+; BMI-NEXT:    andl %edi, %esi
+; BMI-NEXT:    andnl %edx, %edi, %eax
+; BMI-NEXT:    orl %esi, %eax
+; BMI-NEXT:    retq
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %a2, %not
+  %or = or i32 %and1, %and0
+  ret i32 %or
+}
+
+define i32 @masked_merge2(i32, i32, i32) {
+; CHECK-LABEL: masked_merge2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    notl %esi
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    notl %eax
+; CHECK-NEXT:    andl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    retq
+;
+; BMI-LABEL: masked_merge2:
+; BMI:       # %bb.0:
+; BMI-NEXT:    notl %edx
+; BMI-NEXT:    andnl %edx, %edi, %ecx
+; BMI-NEXT:    andnl %edi, %esi, %eax
+; BMI-NEXT:    orl %ecx, %eax
+; BMI-NEXT:    retq
+  %v0 = xor i32 %1, -1
+  %v1 = xor i32 %2, -1
+  %not = xor i32 %0, -1
+  %and0 = and i32 %not, %v1
+  %and1 = and i32 %v0, %0
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
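
The no-BMI path relies on the pure bitwise identity (x & m) | (y & ~m) == ((x ^ y) & m) ^ y, which is what lets the "xor; and; xor" sequence replace the "not; and; and; or" form. As a quick sanity check of that identity, here is a small standalone C++ sketch; it is illustrative only, not part of the patch, and the 8-bit brute force and the names Merged/Folded are mine.

// Illustrative sketch (not part of the patch): exhaustively verifies the
// masked-merge identity that foldMaskedMergeImpl relies on, over all
// 8-bit values of x, y and the mask m:
//   (x & m) | (y & ~m)  ==  ((x ^ y) & m) ^ y
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned m = 0; m < 256; ++m)
    for (unsigned x = 0; x < 256; ++x)
      for (unsigned y = 0; y < 256; ++y) {
        uint8_t Merged = (x & m) | (y & ~m); // select x where m is set, else y
        uint8_t Folded = ((x ^ y) & m) ^ y;  // same selection as xor; and; xor
        assert(Merged == Folded);
      }
  return 0;
}

Note that the folded form uses y twice while the original pattern used it only once; this appears to be why the patch freezes that operand (Freeze_And0_R) before duplicating it, since an undef or poison input must not be allowed to take different values at its two uses.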