Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8581,6 +8581,10 @@ if (Num == MaxXors) return false; + // Skip the one-use zext + if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse()) + N = N->getOperand(0); + // The leaf node must be XOR if (N->getOpcode() == ISD::XOR) { WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1))); @@ -8617,29 +8621,18 @@ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) && LHS->getOpcode() == ISD::OR && LHS->hasOneUse() && isOrXorChain(LHS, NumXors, WorkList)) { - SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC); - EVT TstVT = LHS->getValueType(0); SDValue XOR0, XOR1; std::tie(XOR0, XOR1) = WorkList[0]; - SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, - DAG.getVTList(TstVT, MVT::i32), XOR0, XOR1); - SDValue Overflow = Cmp.getValue(1); - SDValue CCmp; + unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR; + SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond); for (unsigned I = 1; I < WorkList.size(); I++) { std::tie(XOR0, XOR1) = WorkList[I]; - SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32); - CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR0, XOR1, NZCVOp, - CCVal, Overflow); - Overflow = CCmp; + SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond); + Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain); } // Exit early by inverting the condition, which help reduce indentations. - SDValue TVal = DAG.getConstant(1, DL, VT); - SDValue FVal = DAG.getConstant(0, DL, VT); - AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond); - AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC); - return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, - DAG.getConstant(InvCC, DL, MVT::i32), CCmp); + return Cmp; } return SDValue(); @@ -8680,11 +8673,6 @@ } } - // Address some cases folded And in the stage of `Optimized type-legalized - // selection` - if (SDValue V = performOrXorChainCombine(Op.getNode(), DAG)) - return V; - if (LHS.getValueType().isInteger()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp( @@ -19757,9 +19745,8 @@ } // Try to perform the memcmp when the result is tested for [in]equality with 0 - if (!DCI.isBeforeLegalize()) - if (SDValue V = performOrXorChainCombine(N, DAG)) - return V; + if (SDValue V = performOrXorChainCombine(N, DAG)) + return V; return SDValue(); } Index: llvm/test/CodeGen/AArch64/atomicrmw-O0.ll =================================================================== --- llvm/test/CodeGen/AArch64/atomicrmw-O0.ll +++ llvm/test/CodeGen/AArch64/atomicrmw-O0.ll @@ -215,41 +215,40 @@ ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Loop Header: Depth=1 ; NOLSE-NEXT: // Child Loop BB4_2 Depth 2 -; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload -; NOLSE-NEXT: adds x14, x13, #1 -; NOLSE-NEXT: cinc x15, x11, hs +; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload +; NOLSE-NEXT: adds x14, x11, #1 +; NOLSE-NEXT: cinc x15, x13, hs ; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; NOLSE-NEXT: // Parent Loop BB4_1 Depth=1 ; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x8, [x10] +; NOLSE-NEXT: ldaxp x10, x12, [x9] +; NOLSE-NEXT: cmp x10, x11 +; NOLSE-NEXT: cset w8, ne ; NOLSE-NEXT: cmp x12, x13 -; NOLSE-NEXT: cset w9, ne -; NOLSE-NEXT: cmp x8, x11 -; NOLSE-NEXT: cinc w9, w9, ne -; NOLSE-NEXT: cbnz w9, .LBB4_4 +; NOLSE-NEXT: cinc w8, w8, ne +; NOLSE-NEXT: cbnz w8, .LBB4_4 ; NOLSE-NEXT: // %bb.3: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 -; NOLSE-NEXT: stlxp w9, x14, x15, [x10] -; NOLSE-NEXT: cbnz w9, .LBB4_2 +; NOLSE-NEXT: stlxp w8, x14, x15, [x9] +; NOLSE-NEXT: cbnz w8, .LBB4_2 ; NOLSE-NEXT: b .LBB4_5 ; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 -; NOLSE-NEXT: stlxp w9, x12, x8, [x10] -; NOLSE-NEXT: cbnz w9, .LBB4_2 +; NOLSE-NEXT: stlxp w8, x10, x12, [x9] +; NOLSE-NEXT: cbnz w8, .LBB4_2 ; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1 -; NOLSE-NEXT: mov x9, x8 -; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill -; NOLSE-NEXT: mov x10, x12 -; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: mov x8, x12 +; NOLSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; NOLSE-NEXT: mov x9, x10 +; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill ; NOLSE-NEXT: subs x12, x12, x13 -; NOLSE-NEXT: ccmp x8, x11, #0, eq -; NOLSE-NEXT: cset w8, ne -; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill -; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill -; NOLSE-NEXT: tbnz w8, #0, .LBB4_1 +; NOLSE-NEXT: ccmp x10, x11, #0, eq +; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill +; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill +; NOLSE-NEXT: b.ne .LBB4_1 ; NOLSE-NEXT: b .LBB4_6 ; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload @@ -269,30 +268,29 @@ ; LSE-NEXT: b .LBB4_1 ; LSE-NEXT: .LBB4_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload -; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload -; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload -; LSE-NEXT: mov x0, x11 -; LSE-NEXT: mov x1, x8 -; LSE-NEXT: adds x2, x11, #1 -; LSE-NEXT: cinc x10, x8, hs +; LSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload +; LSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload +; LSE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload +; LSE-NEXT: mov x0, x10 +; LSE-NEXT: mov x1, x11 +; LSE-NEXT: adds x2, x10, #1 +; LSE-NEXT: cinc x9, x11, hs ; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3 -; LSE-NEXT: mov x3, x10 -; LSE-NEXT: caspal x0, x1, x2, x3, [x9] -; LSE-NEXT: mov x9, x1 +; LSE-NEXT: mov x3, x9 +; LSE-NEXT: caspal x0, x1, x2, x3, [x8] +; LSE-NEXT: mov x9, x0 ; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill -; LSE-NEXT: mov x10, x0 -; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill -; LSE-NEXT: subs x11, x10, x11 -; LSE-NEXT: ccmp x9, x8, #0, eq -; LSE-NEXT: cset w8, ne -; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill -; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill -; LSE-NEXT: tbnz w8, #0, .LBB4_1 +; LSE-NEXT: mov x8, x1 +; LSE-NEXT: str x8, [sp, #16] // 8-byte Folded Spill +; LSE-NEXT: subs x11, x8, x11 +; LSE-NEXT: ccmp x9, x10, #0, eq +; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill +; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill +; LSE-NEXT: b.ne .LBB4_1 ; LSE-NEXT: b .LBB4_2 ; LSE-NEXT: .LBB4_2: // %atomicrmw.end -; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload -; LSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload +; LSE-NEXT: ldr x1, [sp, #16] // 8-byte Folded Reload +; LSE-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload ; LSE-NEXT: add sp, sp, #48 ; LSE-NEXT: ret entry: @@ -607,45 +605,44 @@ ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Loop Header: Depth=1 ; NOLSE-NEXT: // Child Loop BB9_2 Depth 2 -; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload -; NOLSE-NEXT: mov w8, w13 -; NOLSE-NEXT: mvn w9, w8 +; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload +; NOLSE-NEXT: mov w8, w11 +; NOLSE-NEXT: mvn w10, w8 ; NOLSE-NEXT: // implicit-def: $x8 -; NOLSE-NEXT: mov w8, w9 +; NOLSE-NEXT: mov w8, w10 ; NOLSE-NEXT: orr x14, x8, #0xfffffffffffffffe ; NOLSE-NEXT: mov x15, #-1 ; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; NOLSE-NEXT: // Parent Loop BB9_1 Depth=1 ; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x8, [x10] +; NOLSE-NEXT: ldaxp x10, x12, [x9] +; NOLSE-NEXT: cmp x10, x11 +; NOLSE-NEXT: cset w8, ne ; NOLSE-NEXT: cmp x12, x13 -; NOLSE-NEXT: cset w9, ne -; NOLSE-NEXT: cmp x8, x11 -; NOLSE-NEXT: cinc w9, w9, ne -; NOLSE-NEXT: cbnz w9, .LBB9_4 +; NOLSE-NEXT: cinc w8, w8, ne +; NOLSE-NEXT: cbnz w8, .LBB9_4 ; NOLSE-NEXT: // %bb.3: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 -; NOLSE-NEXT: stlxp w9, x14, x15, [x10] -; NOLSE-NEXT: cbnz w9, .LBB9_2 +; NOLSE-NEXT: stlxp w8, x14, x15, [x9] +; NOLSE-NEXT: cbnz w8, .LBB9_2 ; NOLSE-NEXT: b .LBB9_5 ; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 -; NOLSE-NEXT: stlxp w9, x12, x8, [x10] -; NOLSE-NEXT: cbnz w9, .LBB9_2 +; NOLSE-NEXT: stlxp w8, x10, x12, [x9] +; NOLSE-NEXT: cbnz w8, .LBB9_2 ; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1 -; NOLSE-NEXT: mov x9, x8 -; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill -; NOLSE-NEXT: mov x10, x12 -; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: mov x8, x12 +; NOLSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; NOLSE-NEXT: mov x9, x10 +; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill ; NOLSE-NEXT: subs x12, x12, x13 -; NOLSE-NEXT: ccmp x8, x11, #0, eq -; NOLSE-NEXT: cset w8, ne -; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill -; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill -; NOLSE-NEXT: tbnz w8, #0, .LBB9_1 +; NOLSE-NEXT: ccmp x10, x11, #0, eq +; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill +; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill +; NOLSE-NEXT: b.ne .LBB9_1 ; NOLSE-NEXT: b .LBB9_6 ; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end ; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload @@ -665,34 +662,33 @@ ; LSE-NEXT: b .LBB9_1 ; LSE-NEXT: .LBB9_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload -; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload -; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload -; LSE-NEXT: mov x0, x11 -; LSE-NEXT: mov x1, x8 -; LSE-NEXT: mov w10, w11 -; LSE-NEXT: mvn w12, w10 -; LSE-NEXT: // implicit-def: $x10 -; LSE-NEXT: mov w10, w12 -; LSE-NEXT: orr x2, x10, #0xfffffffffffffffe -; LSE-NEXT: mov x10, #-1 +; LSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload +; LSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload +; LSE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload +; LSE-NEXT: mov x0, x10 +; LSE-NEXT: mov x1, x11 +; LSE-NEXT: mov w9, w10 +; LSE-NEXT: mvn w12, w9 +; LSE-NEXT: // implicit-def: $x9 +; LSE-NEXT: mov w9, w12 +; LSE-NEXT: orr x2, x9, #0xfffffffffffffffe +; LSE-NEXT: mov x9, #-1 ; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3 -; LSE-NEXT: mov x3, x10 -; LSE-NEXT: caspal x0, x1, x2, x3, [x9] -; LSE-NEXT: mov x9, x1 +; LSE-NEXT: mov x3, x9 +; LSE-NEXT: caspal x0, x1, x2, x3, [x8] +; LSE-NEXT: mov x9, x0 ; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill -; LSE-NEXT: mov x10, x0 -; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill -; LSE-NEXT: subs x11, x10, x11 -; LSE-NEXT: ccmp x9, x8, #0, eq -; LSE-NEXT: cset w8, ne -; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill -; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill -; LSE-NEXT: tbnz w8, #0, .LBB9_1 +; LSE-NEXT: mov x8, x1 +; LSE-NEXT: str x8, [sp, #16] // 8-byte Folded Spill +; LSE-NEXT: subs x11, x8, x11 +; LSE-NEXT: ccmp x9, x10, #0, eq +; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill +; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill +; LSE-NEXT: b.ne .LBB9_1 ; LSE-NEXT: b .LBB9_2 ; LSE-NEXT: .LBB9_2: // %atomicrmw.end -; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload -; LSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload +; LSE-NEXT: ldr x1, [sp, #16] // 8-byte Folded Reload +; LSE-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload ; LSE-NEXT: add sp, sp, #48 ; LSE-NEXT: ret entry: Index: llvm/test/CodeGen/AArch64/bcmp.ll =================================================================== --- llvm/test/CodeGen/AArch64/bcmp.ll +++ llvm/test/CodeGen/AArch64/bcmp.ll @@ -133,19 +133,16 @@ ret i1 %r } -; TODO: or (xor a, b), (and (xor c, d), C2) +; or (xor a, b), (and (xor c, d), C2) define i1 @bcmp9(ptr %a, ptr %b) { ; CHECK-LABEL: bcmp9: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w9, [x0, #8] -; CHECK-NEXT: ldrb w10, [x1, #8] ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ldr x11, [x1] -; CHECK-NEXT: eor w9, w9, w10 -; CHECK-NEXT: and x9, x9, #0xff -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: ldr x9, [x1] +; CHECK-NEXT: ldrb w10, [x0, #8] +; CHECK-NEXT: ldrb w11, [x1, #8] +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: ccmp x10, x11, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 9) @@ -156,15 +153,12 @@ define i1 @bcmp10(ptr %a, ptr %b) { ; CHECK-LABEL: bcmp10: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w9, [x0, #8] -; CHECK-NEXT: ldrh w10, [x1, #8] ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ldr x11, [x1] -; CHECK-NEXT: eor w9, w9, w10 -; CHECK-NEXT: and x9, x9, #0xffff -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: ldr x9, [x1] +; CHECK-NEXT: ldrh w10, [x0, #8] +; CHECK-NEXT: ldrh w11, [x1, #8] +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: ccmp x10, x11, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 10) @@ -195,10 +189,8 @@ ; CHECK-NEXT: ldr x9, [x1] ; CHECK-NEXT: ldr w10, [x0, #8] ; CHECK-NEXT: ldr w11, [x1, #8] -; CHECK-NEXT: eor x8, x8, x9 -; CHECK-NEXT: eor w9, w10, w11 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: ccmp x10, x11, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 12) @@ -274,13 +266,10 @@ ; CHECK-NEXT: ldp x8, x9, [x0] ; CHECK-NEXT: ldp x10, x11, [x1] ; CHECK-NEXT: ldr w12, [x0, #16] -; CHECK-NEXT: ldr w13, [x1, #16] -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x11 -; CHECK-NEXT: eor w10, w12, w13 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: orr x8, x8, x10 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ldr w8, [x1, #16] +; CHECK-NEXT: ccmp x9, x11, #0, eq +; CHECK-NEXT: ccmp x12, x8, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 20) @@ -311,17 +300,13 @@ ; CHECK-NEXT: ldp x8, x9, [x0] ; CHECK-NEXT: ldp x10, x11, [x1] ; CHECK-NEXT: ldr x12, [x0, #16] -; CHECK-NEXT: ldr x13, [x1, #16] -; CHECK-NEXT: ldr w14, [x0, #24] -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: ldr w15, [x1, #24] -; CHECK-NEXT: eor x9, x9, x11 -; CHECK-NEXT: eor x10, x12, x13 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: eor w11, w14, w15 -; CHECK-NEXT: orr x9, x10, x11 -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ldr x8, [x1, #16] +; CHECK-NEXT: ccmp x9, x11, #0, eq +; CHECK-NEXT: ldr w9, [x0, #24] +; CHECK-NEXT: ldr w10, [x1, #24] +; CHECK-NEXT: ccmp x12, x8, #0, eq +; CHECK-NEXT: ccmp x9, x10, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 28) @@ -334,21 +319,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp x8, x9, [x0] ; CHECK-NEXT: ldp x10, x11, [x1] -; CHECK-NEXT: ldp x12, x13, [x0, #16] -; CHECK-NEXT: ldp x14, x15, [x1, #16] -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x11 -; CHECK-NEXT: ldrb w16, [x0, #32] -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: ldrb w17, [x1, #32] -; CHECK-NEXT: eor x10, x12, x14 -; CHECK-NEXT: eor x11, x13, x15 -; CHECK-NEXT: eor w12, w16, w17 -; CHECK-NEXT: orr x9, x10, x11 -; CHECK-NEXT: and x10, x12, #0xff -; CHECK-NEXT: orr x8, x8, x9 -; CHECK-NEXT: orr x8, x8, x10 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x11, #0, eq +; CHECK-NEXT: ldrb w11, [x1, #32] +; CHECK-NEXT: ldp x8, x9, [x0, #16] +; CHECK-NEXT: ldp x12, x10, [x1, #16] +; CHECK-NEXT: ccmp x8, x12, #0, eq +; CHECK-NEXT: ldrb w8, [x0, #32] +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: ccmp x8, x11, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 33) @@ -450,3 +429,111 @@ ret i1 %r } +; https://www.godbolt.org/z/GMosfa1nc +define i1 @bcmp_zext(i32 %0, i32 %1, i8 %2, i8 %3) { +; CHECK-LABEL: bcmp_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w2, #0xff +; CHECK-NEXT: and w9, w3, #0xff +; CHECK-NEXT: cmp w1, w0 +; CHECK-NEXT: ccmp w9, w8, #0, eq +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %5 = xor i32 %1, %0 + %6 = xor i8 %3, %2 + %7 = zext i8 %6 to i32 + %8 = or i32 %5, %7 + %9 = icmp eq i32 %8, 0 + ret i1 %9 +} + +define i1 @bcmp_i8(i8 %a0, i8 %b0, i8 %a1, i8 %b1, i8 %a2, i8 %b2) { +; CHECK-LABEL: bcmp_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: and w9, w1, #0xff +; CHECK-NEXT: and w8, w2, #0xff +; CHECK-NEXT: and w10, w3, #0xff +; CHECK-NEXT: cmp w9, w0, uxtb +; CHECK-NEXT: ccmp w10, w8, #0, eq +; CHECK-NEXT: and w8, w4, #0xff +; CHECK-NEXT: and w9, w5, #0xff +; CHECK-NEXT: ccmp w9, w8, #0, eq +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %xor0 = xor i8 %b0, %a0 + %xor1 = xor i8 %b1, %a1 + %xor2 = xor i8 %b2, %a2 + %or0 = or i8 %xor0, %xor1 + %or1 = or i8 %or0, %xor2 + %r = icmp eq i8 %or1, 0 + ret i1 %r +} + +define i1 @bcmp_i16(i16 %a0, i16 %b0, i16 %a1, i16 %b1, i16 %a2, i16 %b2) { +; CHECK-LABEL: bcmp_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: and w9, w1, #0xffff +; CHECK-NEXT: and w8, w2, #0xffff +; CHECK-NEXT: and w10, w3, #0xffff +; CHECK-NEXT: cmp w9, w0, uxth +; CHECK-NEXT: ccmp w10, w8, #0, eq +; CHECK-NEXT: and w8, w4, #0xffff +; CHECK-NEXT: and w9, w5, #0xffff +; CHECK-NEXT: ccmp w9, w8, #0, eq +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %xor0 = xor i16 %b0, %a0 + %xor1 = xor i16 %b1, %a1 + %xor2 = xor i16 %b2, %a2 + %or0 = or i16 %xor0, %xor1 + %or1 = or i16 %or0, %xor2 + %r = icmp eq i16 %or1, 0 + ret i1 %r +} + +define i1 @bcmp_i128(i128 %a0, i128 %b0, i128 %a1, i128 %b1, i128 %a2, i128 %b2) { +; CHECK-LABEL: bcmp_i128: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp x9, x8, [sp] +; CHECK-NEXT: ldp x10, x11, [sp, #16] +; CHECK-NEXT: cmp x10, x9 +; CHECK-NEXT: ccmp x11, x8, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: cmp x2, x0 +; CHECK-NEXT: ccmp x3, x1, #0, eq +; CHECK-NEXT: ccmp x6, x4, #0, eq +; CHECK-NEXT: ccmp x7, x5, #0, eq +; CHECK-NEXT: cset w9, ne +; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: ret + %xor0 = xor i128 %b0, %a0 + %xor1 = xor i128 %b1, %a1 + %xor2 = xor i128 %b2, %a2 + %or0 = or i128 %xor0, %xor1 + %or1 = or i128 %or0, %xor2 + %r = icmp ne i128 %or1, 0 + ret i1 %r +} + +define i1 @bcmp_i42(i42 %a0, i42 %b0, i42 %a1, i42 %b1, i42 %a2, i42 %b2) { +; CHECK-LABEL: bcmp_i42: +; CHECK: // %bb.0: +; CHECK-NEXT: and x9, x0, #0x3ffffffffff +; CHECK-NEXT: and x10, x1, #0x3ffffffffff +; CHECK-NEXT: and x8, x2, #0x3ffffffffff +; CHECK-NEXT: and x11, x3, #0x3ffffffffff +; CHECK-NEXT: cmp x10, x9 +; CHECK-NEXT: and x9, x5, #0x3ffffffffff +; CHECK-NEXT: ccmp x11, x8, #0, eq +; CHECK-NEXT: and x8, x4, #0x3ffffffffff +; CHECK-NEXT: ccmp x9, x8, #0, eq +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %xor0 = xor i42 %b0, %a0 + %xor1 = xor i42 %b1, %a1 + %xor2 = xor i42 %b2, %a2 + %or0 = or i42 %xor0, %xor1 + %or1 = or i42 %or0, %xor2 + %r = icmp ne i42 %or1, 0 + ret i1 %r +} Index: llvm/test/CodeGen/AArch64/dag-combine-setcc.ll =================================================================== --- llvm/test/CodeGen/AArch64/dag-combine-setcc.ll +++ llvm/test/CodeGen/AArch64/dag-combine-setcc.ll @@ -191,10 +191,7 @@ ; CHECK-LABEL: combine_setcc_glue: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp x0, x2 -; CHECK-NEXT: cset w8, eq -; CHECK-NEXT: ccmp x1, x3, #0, eq -; CHECK-NEXT: cset w9, eq -; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret entry: %cmp3 = icmp eq i128 %x, %y @@ -218,11 +215,12 @@ ; CHECK-NEXT: csel x10, x0, x8, lo ; CHECK-NEXT: cmp x1, x9 ; CHECK-NEXT: csel x8, x0, x8, lo -; CHECK-NEXT: csel x8, x10, x8, eq -; CHECK-NEXT: csel x10, x1, x9, lo -; CHECK-NEXT: subs x8, x2, x8 -; CHECK-NEXT: sbc x9, x3, x10 -; CHECK-NEXT: ccmp x3, x10, #0, eq +; CHECK-NEXT: csel x11, x1, x9, lo +; CHECK-NEXT: csel x10, x10, x8, eq +; CHECK-NEXT: subs x8, x2, x10 +; CHECK-NEXT: sbc x9, x3, x11 +; CHECK-NEXT: cmp x3, x11 +; CHECK-NEXT: ccmp x2, x10, #0, eq ; CHECK-NEXT: b.ne .LBB12_1 ; CHECK-NEXT: // %bb.2: // %do.end ; CHECK-NEXT: mov x0, xzr Index: llvm/test/CodeGen/AArch64/i128-cmp.ll =================================================================== --- llvm/test/CodeGen/AArch64/i128-cmp.ll +++ llvm/test/CodeGen/AArch64/i128-cmp.ll @@ -116,8 +116,8 @@ define void @br_on_cmp_i128_eq(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: br_on_cmp_i128_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, x2 -; CHECK-NEXT: ccmp x1, x3, #0, eq +; CHECK-NEXT: cmp x1, x3 +; CHECK-NEXT: ccmp x0, x2, #0, eq ; CHECK-NEXT: b.ne .LBB10_2 ; CHECK-NEXT: // %bb.1: // %call ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill @@ -137,8 +137,8 @@ define void @br_on_cmp_i128_ne(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: br_on_cmp_i128_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, x2 -; CHECK-NEXT: ccmp x1, x3, #0, eq +; CHECK-NEXT: cmp x1, x3 +; CHECK-NEXT: ccmp x0, x2, #0, eq ; CHECK-NEXT: b.eq .LBB11_2 ; CHECK-NEXT: // %bb.1: // %call ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill