diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -129,6 +129,12 @@
                                             "gather intrinsics"),
                                    cl::init(true));
 
+// All of the XOR, OR, and CMP instructions use ALU ports, and the data
+// dependency chain becomes the bottleneck on high-end CPUs after this
+// transform. This cap on the number of leaf nodes keeps cmp+ccmp profitable.
+static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
+                                 cl::desc("Maximum of xors"));
+
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -8551,6 +8557,77 @@
                      DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
 }
 
+// Check whether N is a continuous or/xor comparison chain.
+static bool
+isOrXorChain(SDValue N, unsigned &Num,
+             SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
+  if (Num == MaxXors)
+    return false;
+
+  // The leaf node must be XOR.
+  if (N->getOpcode() == ISD::XOR) {
+    WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
+    Num++;
+    return true;
+  }
+
+  // All the non-leaf nodes must be OR.
+  if (N->getOpcode() != ISD::OR || !N->hasOneUse())
+    return false;
+
+  if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
+      isOrXorChain(N->getOperand(1), Num, WorkList))
+    return true;
+  return false;
+}
+
+// Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
+static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
+
+  // Only handle integer compares.
+  if (N->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
+  // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
+  unsigned NumXors = 0;
+  if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
+      LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
+      isOrXorChain(LHS, NumXors, WorkList)) {
+    SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC);
+    EVT TstVT = LHS->getValueType(0);
+    SDValue XOR0, XOR1;
+    std::tie(XOR0, XOR1) = WorkList[0];
+    SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL,
+                              DAG.getVTList(TstVT, MVT::i32), XOR0, XOR1);
+    SDValue Overflow = Cmp.getValue(1);
+    SDValue CCmp;
+    for (unsigned I = 1; I < WorkList.size(); I++) {
+      std::tie(XOR0, XOR1) = WorkList[I];
+      SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32);
+      CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR0, XOR1, NZCVOp,
+                         CCVal, Overflow);
+      Overflow = CCmp;
+    }
+
+    // Exit early by inverting the condition, which helps reduce indentation.
+    SDValue TVal = DAG.getConstant(1, DL, VT);
+    SDValue FVal = DAG.getConstant(0, DL, VT);
+    AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond);
+    AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC);
+    return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal,
+                       DAG.getConstant(InvCC, DL, MVT::i32), CCmp);
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op,
                                           SelectionDAG &DAG) const {
   if (Op.getValueType().isVector())
@@ -8586,6 +8663,11 @@
     }
   }
 
+  // Handle cases where the AND has already been folded during the
+  // `Optimized type-legalized selection` stage.
+  if (SDValue V = performOrXorChainCombine(Op.getNode(), DAG))
+    return V;
+
   if (LHS.getValueType().isInteger()) {
     SDValue CCVal;
     SDValue Cmp = getAArch64Cmp(
@@ -19625,34 +19707,10 @@
     }
   }
 
-  // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
-  // cmp A0, A0; ccmp A0, B1, 0, eq; cmp inv(Cond) flag
-  if (!DCI.isBeforeLegalize() && VT.isScalarInteger() &&
-      (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
-      LHS->getOpcode() == ISD::OR &&
-      (LHS.getOperand(0)->getOpcode() == ISD::XOR &&
-       LHS.getOperand(1)->getOpcode() == ISD::XOR) &&
-      LHS.hasOneUse() && LHS.getOperand(0)->hasOneUse() &&
-      LHS.getOperand(1)->hasOneUse()) {
-    SDValue XOR0 = LHS.getOperand(0);
-    SDValue XOR1 = LHS.getOperand(1);
-    SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC);
-    EVT TstVT = LHS->getValueType(0);
-    SDValue Cmp =
-        DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(TstVT, MVT::i32),
-                    XOR0.getOperand(0), XOR0.getOperand(1));
-    SDValue Overflow = Cmp.getValue(1);
-    SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32);
-    SDValue CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR1.getOperand(0),
-                               XOR1.getOperand(1), NZCVOp, CCVal, Overflow);
-    // Invert CSEL's operands.
-    SDValue TVal = DAG.getConstant(1, DL, VT);
-    SDValue FVal = DAG.getConstant(0, DL, VT);
-    AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond);
-    AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC);
-    return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal,
-                       DAG.getConstant(InvCC, DL, MVT::i32), CCmp);
-  }
+  // Try to perform the memcmp combine when the result is tested for [in]equality with 0
+  if (!DCI.isBeforeLegalize())
+    if (SDValue V = performOrXorChainCombine(N, DAG))
+      return V;
 
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
--- a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
@@ -70,17 +70,13 @@
 ; CHECKN-NEXT:    ldp x8, x9, [x0]
 ; CHECKN-NEXT:    ldp x10, x11, [x1]
 ; CHECKN-NEXT:    ldr x12, [x0, #16]
-; CHECKN-NEXT:    ldr x13, [x1, #16]
-; CHECKN-NEXT:    ldur x14, [x0, #23]
-; CHECKN-NEXT:    eor x8, x8, x10
-; CHECKN-NEXT:    ldur x15, [x1, #23]
-; CHECKN-NEXT:    eor x9, x9, x11
-; CHECKN-NEXT:    eor x10, x12, x13
-; CHECKN-NEXT:    orr x8, x8, x9
-; CHECKN-NEXT:    eor x11, x14, x15
-; CHECKN-NEXT:    orr x9, x10, x11
-; CHECKN-NEXT:    orr x8, x8, x9
-; CHECKN-NEXT:    cmp x8, #0
+; CHECKN-NEXT:    cmp x8, x10
+; CHECKN-NEXT:    ldr x8, [x1, #16]
+; CHECKN-NEXT:    ccmp x9, x11, #0, eq
+; CHECKN-NEXT:    ldur x9, [x0, #23]
+; CHECKN-NEXT:    ldur x10, [x1, #23]
+; CHECKN-NEXT:    ccmp x12, x8, #0, eq
+; CHECKN-NEXT:    ccmp x9, x10, #0, eq
 ; CHECKN-NEXT:    cset w0, eq
 ; CHECKN-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll
--- a/llvm/test/CodeGen/AArch64/bcmp.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp.ll
@@ -39,6 +39,7 @@
   ret i1 %r
 }
 
+; or (and (xor a, b), C1), (and (xor c, d), C2)
 define i1 @bcmp3(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp3:
 ; CHECK:       // %bb.0:
@@ -46,10 +47,8 @@
 ; CHECK-NEXT:    ldrh w9, [x1]
 ; CHECK-NEXT:    ldrb w10, [x0, #2]
 ; CHECK-NEXT:    ldrb w11, [x1, #2]
-; CHECK-NEXT:    eor w8, w8, w9
-; CHECK-NEXT:    eor w9, w10, w11
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    ccmp w10, w11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 3)
@@ -70,6 +69,7 @@
   ret i1 %r
 }
 
+; or (xor a, b), (and (xor c, d), C2)
 define i1 @bcmp5(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp5:
 ; CHECK:       // %bb.0:
@@ -77,10 +77,8 @@
 ; CHECK-NEXT:    ldr w9, [x1]
 ; CHECK-NEXT:    ldrb w10, [x0, #4]
 ; CHECK-NEXT:    ldrb w11, [x1, #4]
-; CHECK-NEXT:    eor w8, w8, w9
-; CHECK-NEXT:    eor w9, w10, w11
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    ccmp w10, w11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 5)
@@ -88,6 +86,7 @@
   ret i1 %r
 }
 
+; or (xor a, b), (and (xor c, d), C2)
 define i1 @bcmp6(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp6:
 ; CHECK:       // %bb.0:
@@ -95,10 +94,8 @@
 ; CHECK-NEXT:    ldr w9, [x1]
 ; CHECK-NEXT:    ldrh w10, [x0, #4]
 ; CHECK-NEXT:    ldrh w11, [x1, #4]
-; CHECK-NEXT:    eor w8, w8, w9
-; CHECK-NEXT:    eor w9, w10, w11
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    ccmp w10, w11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 6)
@@ -106,6 +103,7 @@
   ret i1 %r
 }
 
+; or (xor a, b), (xor c, d)
 define i1 @bcmp7(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp7:
 ; CHECK:       // %bb.0:
@@ -135,6 +133,7 @@
   ret i1 %r
 }
 
+; TODO: or (xor a, b), (and (xor c, d), C2)
 define i1 @bcmp9(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp9:
 ; CHECK:       // %bb.0:
@@ -295,13 +294,10 @@
 ; CHECK-NEXT:    ldp x8, x9, [x0]
 ; CHECK-NEXT:    ldp x10, x11, [x1]
 ; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    eor x8, x8, x10
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    eor x10, x12, x13
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    orr x8, x8, x10
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ldr x8, [x1, #16]
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    ccmp x12, x8, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 24)
@@ -365,20 +361,15 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x8, x9, [x0]
 ; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    ldp x12, x13, [x0, #16]
-; CHECK-NEXT:    ldp x14, x15, [x1, #16]
-; CHECK-NEXT:    eor x8, x8, x10
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    ldur x10, [x0, #30]
-; CHECK-NEXT:    orr x8, x8, x9
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
 ; CHECK-NEXT:    ldur x11, [x1, #30]
-; CHECK-NEXT:    eor x12, x12, x14
-; CHECK-NEXT:    eor x13, x13, x15
-; CHECK-NEXT:    orr x9, x12, x13
-; CHECK-NEXT:    eor x10, x10, x11
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    orr x8, x8, x10
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    ldp x8, x9, [x0, #16]
+; CHECK-NEXT:    ldp x12, x10, [x1, #16]
+; CHECK-NEXT:    ccmp x8, x12, #0, eq
+; CHECK-NEXT:    ldur x8, [x0, #30]
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    ccmp x8, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 38)
@@ -391,24 +382,18 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x8, x9, [x0]
 ; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    ldp x12, x13, [x0, #16]
-; CHECK-NEXT:    ldp x14, x15, [x1, #16]
-; CHECK-NEXT:    eor x8, x8, x10
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    ldr x16, [x0, #32]
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    ldr x17, [x1, #32]
-; CHECK-NEXT:    ldur x18, [x0, #37]
-; CHECK-NEXT:    eor x10, x12, x14
-; CHECK-NEXT:    ldur x0, [x1, #37]
-; CHECK-NEXT:    eor x11, x13, x15
-; CHECK-NEXT:    eor x12, x16, x17
-; CHECK-NEXT:    orr x9, x10, x11
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    eor x13, x18, x0
-; CHECK-NEXT:    orr x10, x12, x13
-; CHECK-NEXT:    orr x8, x8, x10
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    ldr x11, [x1, #32]
+; CHECK-NEXT:    ldp x8, x9, [x0, #16]
+; CHECK-NEXT:    ldp x12, x10, [x1, #16]
+; CHECK-NEXT:    ccmp x8, x12, #0, eq
+; CHECK-NEXT:    ldr x8, [x0, #32]
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    ldur x9, [x0, #37]
+; CHECK-NEXT:    ldur x10, [x1, #37]
+; CHECK-NEXT:    ccmp x8, x11, #0, eq
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 45)
@@ -416,33 +401,31 @@
   ret i1 %r
 }
 
+; Although the large cmp chain may not be profitable on high-end CPUs, we
+; believe it is better on most CPUs, so perform the transform now.
+; 8 xor + 7 or + 1 cmp need only 6 cycles on a machine with 4 ALU ports:
+; 2 cycles for the xors
+; 3 cycles for the ors
+; 1 cycle for the cmp
 define i1 @bcmp64(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x8, x9, [x0]
 ; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    ldp x12, x13, [x0, #16]
-; CHECK-NEXT:    ldp x14, x15, [x1, #16]
-; CHECK-NEXT:    eor x8, x8, x10
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    ldp x16, x17, [x0, #32]
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    ldp x18, x2, [x1, #32]
-; CHECK-NEXT:    eor x12, x12, x14
-; CHECK-NEXT:    eor x13, x13, x15
-; CHECK-NEXT:    ldp x3, x0, [x0, #48]
-; CHECK-NEXT:    orr x9, x12, x13
-; CHECK-NEXT:    ldp x10, x11, [x1, #48]
-; CHECK-NEXT:    eor x14, x16, x18
-; CHECK-NEXT:    eor x15, x17, x2
-; CHECK-NEXT:    orr x12, x14, x15
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    eor x10, x3, x10
-; CHECK-NEXT:    eor x11, x0, x11
-; CHECK-NEXT:    orr x10, x10, x11
-; CHECK-NEXT:    orr x9, x12, x10
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    ldp x8, x9, [x0, #16]
+; CHECK-NEXT:    ldp x12, x10, [x1, #16]
+; CHECK-NEXT:    ccmp x8, x12, #0, eq
+; CHECK-NEXT:    ldp x8, x11, [x1, #32]
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    ldp x9, x10, [x0, #32]
+; CHECK-NEXT:    ccmp x9, x8, #0, eq
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    ldp x9, x10, [x0, #48]
+; CHECK-NEXT:    ldp x8, x11, [x1, #48]
+; CHECK-NEXT:    ccmp x9, x8, #0, eq
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 64)
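
Illustrative note (not part of the patch): a minimal IR sketch of the pattern the new
performOrXorChainCombine targets, analogous to the bcmp tests above. The function
name @xor_or_chain and the value names are hypothetical and only show the shape of an
or-of-xors chain compared against zero:

  define i1 @xor_or_chain(i64 %a0, i64 %b0, i64 %a1, i64 %b1) {
    %x0 = xor i64 %a0, %b0        ; leaf: xor of the first pair
    %x1 = xor i64 %a1, %b1        ; leaf: xor of the second pair
    %or = or i64 %x0, %x1         ; non-leaf: or joining the leaves
    %eq = icmp eq i64 %or, 0      ; equality test against zero
    ret i1 %eq
  }

With this change the chain is expected to lower to roughly "cmp x0, x1; ccmp x2, x3,
#0, eq; cset w0, eq" instead of the previous "eor; eor; orr; cmp #0; cset" sequence,
matching the bcmp7-style checks above.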