diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19490,6 +19490,35 @@
     }
   }
 
+  // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
+  // cmp A0, A1; ccmp B0, B1, #0, eq; cset inv(Cond)
+  if (!DCI.isBeforeLegalize() && VT.isScalarInteger() &&
+      (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
+      LHS->getOpcode() == ISD::OR &&
+      (LHS.getOperand(0)->getOpcode() == ISD::XOR &&
+       LHS.getOperand(1)->getOpcode() == ISD::XOR) &&
+      LHS.hasOneUse() && LHS.getOperand(0)->hasOneUse() &&
+      LHS.getOperand(1)->hasOneUse()) {
+    SDValue XOR0 = LHS.getOperand(0);
+    SDValue XOR1 = LHS.getOperand(1);
+    SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC);
+    EVT TstVT = LHS->getValueType(0);
+    SDValue Cmp =
+        DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(TstVT, MVT::i32),
+                    XOR0.getOperand(0), XOR0.getOperand(1));
+    SDValue Overflow = Cmp.getValue(1);
+    SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32);
+    SDValue CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR1.getOperand(0),
+                               XOR1.getOperand(1), NZCVOp, CCVal, Overflow);
+    // Invert CSEL's operands.
+    SDValue TVal = DAG.getConstant(1, DL, VT);
+    SDValue FVal = DAG.getConstant(0, DL, VT);
+    AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond);
+    AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC);
+    return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal,
+                       DAG.getConstant(InvCC, DL, MVT::i32), CCmp);
+  }
+
   return SDValue();
 }
 
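The rewrite is sound because testing an OR of XORs against zero is the same
predicate as a conjunction of two equality compares, which is exactly what the
emitted SUBS/CCMP pair evaluates in the NZCV flags. A minimal self-checking
sketch of that identity and of the ccmp fallthrough semantics (plain C++,
standalone and illustrative only; not part of the patch):

#include <cassert>
#include <cstdint>

// Old lowering: materialize the OR-of-XORs and test it against zero.
static bool eqViaXorOr(uint64_t A0, uint64_t A1, uint64_t B0, uint64_t B1) {
  return ((A0 ^ A1) | (B0 ^ B1)) == 0;
}

// New lowering, modeled in C++: "cmp A0, A1" then "ccmp B0, B1, #0, eq".
// CCMP performs the second compare only if the first one set EQ; otherwise
// it loads the immediate NZCV #0, in which Z is clear, i.e. "not equal".
static bool eqViaCmpCcmp(uint64_t A0, uint64_t A1, uint64_t B0, uint64_t B1) {
  bool Z = (A0 == A1);        // cmp A0, A1
  Z = Z ? (B0 == B1) : false; // ccmp B0, B1, #0, eq
  return Z;                   // cset w0, eq
}

int main() {
  const uint64_t Vals[] = {0, 1, ~0ULL, 0x8000000000000000ULL};
  for (uint64_t A0 : Vals)
    for (uint64_t A1 : Vals)
      for (uint64_t B0 : Vals)
        for (uint64_t B1 : Vals)
          assert(eqViaXorOr(A0, A1, B0, B1) == eqViaCmpCcmp(A0, A1, B0, B1));
  return 0;
}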
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
--- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -216,38 +216,40 @@
 ; NOLSE-NEXT: // =>This Loop Header: Depth=1
 ; NOLSE-NEXT: // Child Loop BB4_2 Depth 2
 ; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
-; NOLSE-NEXT: adds x14, x8, #1
+; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT: adds x14, x13, #1
 ; NOLSE-NEXT: cinc x15, x11, hs
 ; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start
 ; NOLSE-NEXT: // Parent Loop BB4_1 Depth=1
 ; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxp x10, x9, [x13]
-; NOLSE-NEXT: cmp x10, x8
-; NOLSE-NEXT: cset w12, ne
-; NOLSE-NEXT: cmp x9, x11
-; NOLSE-NEXT: cinc w12, w12, ne
-; NOLSE-NEXT: cbnz w12, .LBB4_4
+; NOLSE-NEXT: ldaxp x12, x8, [x10]
+; NOLSE-NEXT: cmp x12, x13
+; NOLSE-NEXT: cset w9, ne
+; NOLSE-NEXT: cmp x8, x11
+; NOLSE-NEXT: cinc w9, w9, ne
+; NOLSE-NEXT: cbnz w9, .LBB4_4
 ; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
-; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
-; NOLSE-NEXT: cbnz w12, .LBB4_2
+; NOLSE-NEXT: stlxp w9, x14, x15, [x10]
+; NOLSE-NEXT: cbnz w9, .LBB4_2
 ; NOLSE-NEXT: b .LBB4_5
 ; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
-; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
-; NOLSE-NEXT: cbnz w12, .LBB4_2
+; NOLSE-NEXT: stlxp w9, x12, x8, [x10]
+; NOLSE-NEXT: cbnz w9, .LBB4_2
 ; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1
-; NOLSE-NEXT: eor x11, x9, x11
-; NOLSE-NEXT: eor x8, x10, x8
-; NOLSE-NEXT: orr x8, x8, x11
+; NOLSE-NEXT: mov x9, x8
 ; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT: mov x10, x12
 ; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT: subs x12, x12, x13
+; NOLSE-NEXT: ccmp x8, x11, #0, eq
+; NOLSE-NEXT: cset w8, ne
 ; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
 ; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; NOLSE-NEXT: cbnz x8, .LBB4_1
+; NOLSE-NEXT: tbnz w8, #0, .LBB4_1
 ; NOLSE-NEXT: b .LBB4_6
 ; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end
 ; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -267,26 +269,26 @@
 ; LSE-NEXT: b .LBB4_1
 ; LSE-NEXT: .LBB4_1: // %atomicrmw.start
 ; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
 ; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
-; LSE-NEXT: mov x0, x8
-; LSE-NEXT: mov x1, x10
-; LSE-NEXT: adds x2, x8, #1
-; LSE-NEXT: cinc x11, x10, hs
+; LSE-NEXT: mov x0, x11
+; LSE-NEXT: mov x1, x8
+; LSE-NEXT: adds x2, x11, #1
+; LSE-NEXT: cinc x10, x8, hs
 ; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
-; LSE-NEXT: mov x3, x11
+; LSE-NEXT: mov x3, x10
 ; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
 ; LSE-NEXT: mov x9, x1
 ; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
-; LSE-NEXT: eor x11, x9, x10
 ; LSE-NEXT: mov x10, x0
 ; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
-; LSE-NEXT: eor x8, x10, x8
-; LSE-NEXT: orr x8, x8, x11
+; LSE-NEXT: subs x11, x10, x11
+; LSE-NEXT: ccmp x9, x8, #0, eq
+; LSE-NEXT: cset w8, ne
 ; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
 ; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; LSE-NEXT: cbnz x8, .LBB4_1
+; LSE-NEXT: tbnz w8, #0, .LBB4_1
 ; LSE-NEXT: b .LBB4_2
 ; LSE-NEXT: .LBB4_2: // %atomicrmw.end
 ; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -606,42 +608,44 @@
 ; NOLSE-NEXT: // =>This Loop Header: Depth=1
 ; NOLSE-NEXT: // Child Loop BB9_2 Depth 2
 ; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
-; NOLSE-NEXT: mov w9, w8
-; NOLSE-NEXT: mvn w10, w9
-; NOLSE-NEXT: // implicit-def: $x9
-; NOLSE-NEXT: mov w9, w10
-; NOLSE-NEXT: orr x14, x9, #0xfffffffffffffffe
+; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT: mov w8, w13
+; NOLSE-NEXT: mvn w9, w8
+; NOLSE-NEXT: // implicit-def: $x8
+; NOLSE-NEXT: mov w8, w9
+; NOLSE-NEXT: orr x14, x8, #0xfffffffffffffffe
 ; NOLSE-NEXT: mov x15, #-1
 ; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start
 ; NOLSE-NEXT: // Parent Loop BB9_1 Depth=1
 ; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxp x10, x9, [x13]
-; NOLSE-NEXT: cmp x10, x8
-; NOLSE-NEXT: cset w12, ne
-; NOLSE-NEXT: cmp x9, x11
-; NOLSE-NEXT: cinc w12, w12, ne
-; NOLSE-NEXT: cbnz w12, .LBB9_4
+; NOLSE-NEXT: ldaxp x12, x8, [x10]
+; NOLSE-NEXT: cmp x12, x13
+; NOLSE-NEXT: cset w9, ne
+; NOLSE-NEXT: cmp x8, x11
+; NOLSE-NEXT: cinc w9, w9, ne
+; NOLSE-NEXT: cbnz w9, .LBB9_4
 ; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
-; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
-; NOLSE-NEXT: cbnz w12, .LBB9_2
+; NOLSE-NEXT: stlxp w9, x14, x15, [x10]
+; NOLSE-NEXT: cbnz w9, .LBB9_2
 ; NOLSE-NEXT: b .LBB9_5
 ; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
-; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
-; NOLSE-NEXT: cbnz w12, .LBB9_2
+; NOLSE-NEXT: stlxp w9, x12, x8, [x10]
+; NOLSE-NEXT: cbnz w9, .LBB9_2
 ; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1
-; NOLSE-NEXT: eor x11, x9, x11
-; NOLSE-NEXT: eor x8, x10, x8
-; NOLSE-NEXT: orr x8, x8, x11
+; NOLSE-NEXT: mov x9, x8
 ; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT: mov x10, x12
 ; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT: subs x12, x12, x13
+; NOLSE-NEXT: ccmp x8, x11, #0, eq
+; NOLSE-NEXT: cset w8, ne
 ; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
 ; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; NOLSE-NEXT: cbnz x8, .LBB9_1
+; NOLSE-NEXT: tbnz w8, #0, .LBB9_1
 ; NOLSE-NEXT: b .LBB9_6
 ; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end
 ; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -661,30 +665,30 @@
 ; LSE-NEXT: b .LBB9_1
 ; LSE-NEXT: .LBB9_1: // %atomicrmw.start
 ; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
 ; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
-; LSE-NEXT: mov x0, x8
-; LSE-NEXT: mov x1, x10
-; LSE-NEXT: mov w11, w8
-; LSE-NEXT: mvn w12, w11
-; LSE-NEXT: // implicit-def: $x11
-; LSE-NEXT: mov w11, w12
-; LSE-NEXT: orr x2, x11, #0xfffffffffffffffe
-; LSE-NEXT: mov x11, #-1
+; LSE-NEXT: mov x0, x11
+; LSE-NEXT: mov x1, x8
+; LSE-NEXT: mov w10, w11
+; LSE-NEXT: mvn w12, w10
+; LSE-NEXT: // implicit-def: $x10
+; LSE-NEXT: mov w10, w12
+; LSE-NEXT: orr x2, x10, #0xfffffffffffffffe
+; LSE-NEXT: mov x10, #-1
 ; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
-; LSE-NEXT: mov x3, x11
+; LSE-NEXT: mov x3, x10
 ; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
 ; LSE-NEXT: mov x9, x1
 ; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
-; LSE-NEXT: eor x11, x9, x10
 ; LSE-NEXT: mov x10, x0
 ; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
-; LSE-NEXT: eor x8, x10, x8
-; LSE-NEXT: orr x8, x8, x11
+; LSE-NEXT: subs x11, x10, x11
+; LSE-NEXT: ccmp x9, x8, #0, eq
+; LSE-NEXT: cset w8, ne
 ; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
 ; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; LSE-NEXT: cbnz x8, .LBB9_1
+; LSE-NEXT: tbnz w8, #0, .LBB9_1
 ; LSE-NEXT: b .LBB9_2
 ; LSE-NEXT: .LBB9_2: // %atomicrmw.end
 ; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
--- a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
@@ -12,10 +12,8 @@
 ; CHECKN-NEXT: ldr x9, [x1]
 ; CHECKN-NEXT: ldur x10, [x0, #7]
 ; CHECKN-NEXT: ldur x11, [x1, #7]
-; CHECKN-NEXT: eor x8, x8, x9
-; CHECKN-NEXT: eor x9, x10, x11
-; CHECKN-NEXT: orr x8, x8, x9
-; CHECKN-NEXT: cmp x8, #0
+; CHECKN-NEXT: cmp x8, x9
+; CHECKN-NEXT: ccmp x10, x11, #0, eq
 ; CHECKN-NEXT: cset w0, eq
 ; CHECKN-NEXT: ret
 ;
@@ -44,10 +42,8 @@
 ; CHECKN-NEXT: ldr x9, [x1]
 ; CHECKN-NEXT: ldur x10, [x0, #7]
 ; CHECKN-NEXT: ldur x11, [x1, #7]
-; CHECKN-NEXT: eor x8, x8, x9
-; CHECKN-NEXT: eor x9, x10, x11
-; CHECKN-NEXT: orr x8, x8, x9
-; CHECKN-NEXT: cmp x8, #0
+; CHECKN-NEXT: cmp x8, x9
+; CHECKN-NEXT: ccmp x10, x11, #0, eq
 ; CHECKN-NEXT: cset w0, eq
 ; CHECKN-NEXT: ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll
--- a/llvm/test/CodeGen/AArch64/bcmp.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp.ll
@@ -113,10 +113,8 @@
 ; CHECK-NEXT: ldr w9, [x1]
 ; CHECK-NEXT: ldur w10, [x0, #3]
 ; CHECK-NEXT: ldur w11, [x1, #3]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: eor w9, w10, w11
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: ccmp w10, w11, #0, eq
 ; CHECK-NEXT: cset w0, eq
 ; CHECK-NEXT: ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 7)
@@ -182,10 +180,8 @@
 ; CHECK-NEXT: ldr x9, [x1]
 ; CHECK-NEXT: ldur x10, [x0, #3]
 ; CHECK-NEXT: ldur x11, [x1, #3]
-; CHECK-NEXT: eor x8, x8, x9
-; CHECK-NEXT: eor x9, x10, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
 ; CHECK-NEXT: cset w0, eq
 ; CHECK-NEXT: ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 11)
@@ -218,10 +214,8 @@
 ; CHECK-NEXT: ldr x9, [x1]
 ; CHECK-NEXT: ldur x10, [x0, #5]
 ; CHECK-NEXT: ldur x11, [x1, #5]
-; CHECK-NEXT: eor x8, x8, x9
-; CHECK-NEXT: eor x9, x10, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
 ; CHECK-NEXT: cset w0, eq
 ; CHECK-NEXT: ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 13)
@@ -236,10 +230,8 @@
 ; CHECK-NEXT: ldr x9, [x1]
 ; CHECK-NEXT: ldur x10, [x0, #6]
 ; CHECK-NEXT: ldur x11, [x1, #6]
-; CHECK-NEXT: eor x8, x8, x9
-; CHECK-NEXT: eor x9, x10, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
 ; CHECK-NEXT: cset w0, eq
 ; CHECK-NEXT: ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 14)
@@ -254,10 +246,8 @@
 ; CHECK-NEXT: ldr x9, [x1]
 ; CHECK-NEXT: ldur x10, [x0, #7]
 ; CHECK-NEXT: ldur x11, [x1, #7]
-; CHECK-NEXT: eor x8, x8, x9
-; CHECK-NEXT: eor x9, x10, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
 ; CHECK-NEXT: cset w0, eq
 ; CHECK-NEXT: ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 15)
@@ -270,10 +260,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldp x8, x9, [x0]
 ; CHECK-NEXT: ldp x10, x11, [x1]
-; CHECK-NEXT: eor x8, x8, x10
-; CHECK-NEXT: eor x9, x9, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x11, #0, eq
 ; CHECK-NEXT: cset w0, eq
 ; CHECK-NEXT: ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -128,3 +128,123 @@
   %cmp2 = icmp ne i64 %cast, zeroinitializer
   ret i1 %cmp2
 }
+
+define i1 @combine_setcc_eq0_conjunction_xor_or(ptr %a, ptr %b) {
+; CHECK-LABEL: combine_setcc_eq0_conjunction_xor_or:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp x8, x9, [x0]
+; CHECK-NEXT: ldp x10, x11, [x1]
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x11, #0, eq
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+  %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
+  %cmp = icmp eq i32 %bcmp, 0
+  ret i1 %cmp
+}
+
+define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) {
+; CHECK-LABEL: combine_setcc_ne0_conjunction_xor_or:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp x8, x9, [x0]
+; CHECK-NEXT: ldp x10, x11, [x1]
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x11, #0, eq
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+  %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
+  %cmp = icmp ne i32 %bcmp, 0
+  ret i1 %cmp
+}
+
+; Does not increase the number of instructions when the LHS has multiple uses
+define i32 @combine_setcc_multiuse(i32 %0, i32 %1, i32 %2, i32 %3) {
+; CHECK-LABEL: combine_setcc_multiuse:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor w8, w1, w0
+; CHECK-NEXT: eor w9, w3, w2
+; CHECK-NEXT: orr w8, w9, w8
+; CHECK-NEXT: cbz w8, .LBB10_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: b use
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: ret
+  %5 = xor i32 %1, %0
+  %6 = xor i32 %3, %2
+  %7 = or i32 %6, %5
+  %8 = icmp eq i32 %7, 0
+  br i1 %8, label %11, label %9
+
+9: ; preds = %4
+  %10 = tail call i32 @use(i32 %7) #2
+  br label %11
+
+11: ; preds = %4, %9
+  %12 = phi i32 [ %10, %9 ], [ %0, %4 ]
+  ret i32 %12
+}
+
+; There may be scheduling issues with the glued CMP/CCMP instructions that
+; ISel will create out of the DAG
+define i32 @combine_setcc_glue(i128 noundef %x, i128 noundef %y) {
+; CHECK-LABEL: combine_setcc_glue:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: cset w8, eq
+; CHECK-NEXT: ccmp x1, x3, #0, eq
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: ret
+entry:
+  %cmp3 = icmp eq i128 %x, %y
+  %conv = trunc i128 %x to i64
+  %conv1 = trunc i128 %y to i64
+  %cmp = icmp eq i64 %conv, %conv1
+  %or7 = or i1 %cmp3, %cmp
+  %or = zext i1 %or7 to i32
+  ret i32 %or
+}
+
+; Reduced test from https://github.com/llvm/llvm-project/issues/58675
+define [2 x i64] @PR58675(i128 %a.addr, i128 %b.addr) {
+; CHECK-LABEL: PR58675:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: .LBB12_1: // %do.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cmp x0, x8
+; CHECK-NEXT: csel x10, x0, x8, lo
+; CHECK-NEXT: cmp x1, x9
+; CHECK-NEXT: csel x8, x0, x8, lo
+; CHECK-NEXT: csel x8, x10, x8, eq
+; CHECK-NEXT: csel x10, x1, x9, lo
+; CHECK-NEXT: subs x8, x2, x8
+; CHECK-NEXT: sbc x9, x3, x10
+; CHECK-NEXT: ccmp x3, x10, #0, eq
+; CHECK-NEXT: b.ne .LBB12_1
+; CHECK-NEXT: // %bb.2: // %do.end
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: mov x1, xzr
+; CHECK-NEXT: ret
+entry:
+  br label %do.body
+
+do.body: ; preds = %do.body, %entry
+  %a.addr.i1 = phi i128 [ 1, %do.body ], [ 0, %entry ]
+  %b.addr.i2 = phi i128 [ %sub, %do.body ], [ 0, %entry ]
+  %0 = tail call i128 @llvm.umin.i128(i128 %a.addr, i128 %b.addr.i2)
+  %1 = tail call i128 @llvm.umax.i128(i128 0, i128 %a.addr)
+  %sub = sub i128 %b.addr, %0
+  %cmp18.not = icmp eq i128 %b.addr, %0
+  br i1 %cmp18.not, label %do.end, label %do.body
+
+do.end: ; preds = %do.body
+  ret [2 x i64] zeroinitializer
+}
+
+declare i128 @llvm.umin.i128(i128, i128)
+declare i128 @llvm.umax.i128(i128, i128)
+declare i32 @bcmp(ptr nocapture, ptr nocapture, i64)
+declare i32 @use(i32 noundef)
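The combine_setcc_multiuse test above is what the hasOneUse() checks in the
new combine protect: when the OR-of-XORs value has a user besides the
zero-test, the eor/orr instructions must be emitted anyway, so rewriting the
test into cmp/ccmp would only add instructions. A rough C++ analogue of that
shape (illustrative only; use() stands in for the test's @use):

#include <cstdint>
#include <cstdio>

static uint32_t use(uint32_t V) { return V + 1; } // stand-in for @use

static uint32_t multiuse(uint32_t A0, uint32_t A1, uint32_t B0, uint32_t B1) {
  uint32_t Or = (A1 ^ A0) | (B1 ^ B0); // the value itself is live ...
  if (Or == 0)                         // ... and is also tested against zero,
    return A0;                         // so eor/eor/orr are needed regardless
  return use(Or);
}

int main() {
  printf("%u\n", multiuse(1, 2, 3, 4)); // prints use((2^1)|(4^3)) = 8
  return 0;
}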
diff --git a/llvm/test/CodeGen/AArch64/i128-cmp.ll b/llvm/test/CodeGen/AArch64/i128-cmp.ll
--- a/llvm/test/CodeGen/AArch64/i128-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/i128-cmp.ll
@@ -6,10 +6,8 @@
 define i1 @cmp_i128_eq(i128 %a, i128 %b) {
 ; CHECK-LABEL: cmp_i128_eq:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: eor x8, x1, x3
-; CHECK-NEXT: eor x9, x0, x2
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x3, #0, eq
 ; CHECK-NEXT: cset w0, eq
 ; CHECK-NEXT: ret
   %cmp = icmp eq i128 %a, %b
@@ -19,10 +17,8 @@
 define i1 @cmp_i128_ne(i128 %a, i128 %b) {
 ; CHECK-LABEL: cmp_i128_ne:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: eor x8, x1, x3
-; CHECK-NEXT: eor x9, x0, x2
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x3, #0, eq
 ; CHECK-NEXT: cset w0, ne
 ; CHECK-NEXT: ret
   %cmp = icmp ne i128 %a, %b
@@ -120,10 +116,9 @@
 define void @br_on_cmp_i128_eq(i128 %a, i128 %b) nounwind {
 ; CHECK-LABEL: br_on_cmp_i128_eq:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: eor x8, x1, x3
-; CHECK-NEXT: eor x9, x0, x2
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: cbnz x8, .LBB10_2
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x3, #0, eq
+; CHECK-NEXT: b.ne .LBB10_2
 ; CHECK-NEXT: // %bb.1: // %call
 ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: bl call
@@ -142,10 +137,9 @@
 define void @br_on_cmp_i128_ne(i128 %a, i128 %b) nounwind {
 ; CHECK-LABEL: br_on_cmp_i128_ne:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: eor x8, x1, x3
-; CHECK-NEXT: eor x9, x0, x2
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: cbz x8, .LBB11_2
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x3, #0, eq
+; CHECK-NEXT: b.eq .LBB11_2
 ; CHECK-NEXT: // %bb.1: // %call
 ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: bl call
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -68,12 +68,10 @@
 ; AARCH-NEXT: adds x11, x12, x11
 ; AARCH-NEXT: adc x12, x13, x14
 ; AARCH-NEXT: adds x10, x11, x10
-; AARCH-NEXT: adc x9, x12, x9
 ; AARCH-NEXT: asr x11, x1, #63
-; AARCH-NEXT: eor x9, x9, x11
-; AARCH-NEXT: eor x10, x10, x11
-; AARCH-NEXT: orr x9, x10, x9
-; AARCH-NEXT: cmp x9, #0
+; AARCH-NEXT: adc x9, x12, x9
+; AARCH-NEXT: cmp x10, x11
+; AARCH-NEXT: ccmp x9, x11, #0, eq
 ; AARCH-NEXT: cset w9, ne
 ; AARCH-NEXT: tbz x8, #63, .LBB1_2
 ; AARCH-NEXT: // %bb.1: // %Entry
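The i128 and umulo cases above all reduce to the same shape: a wide equality
test is the conjunction of two 64-bit half compares, which is what the
expected cmp/ccmp/cset sequences encode. A quick self-checking sketch of that
decomposition (plain C++; unsigned __int128 is a GCC/Clang extension, so this
assumes such a compiler):

#include <cassert>
#include <cstdint>

// i128 equality decomposed the way the new lowering emits it: compare the
// low halves (cmp x0, x2), then conditionally compare the high halves
// (ccmp x1, x3, #0, eq), then materialize the result (cset w0, eq).
static bool i128EqViaHalves(unsigned __int128 A, unsigned __int128 B) {
  uint64_t ALo = (uint64_t)A, AHi = (uint64_t)(A >> 64); // x0, x1
  uint64_t BLo = (uint64_t)B, BHi = (uint64_t)(B >> 64); // x2, x3
  return ALo == BLo && AHi == BHi;
}

int main() {
  unsigned __int128 A = ((unsigned __int128)0x1234 << 64) | 0x5678;
  assert(i128EqViaHalves(A, A));                      // equal
  assert(!i128EqViaHalves(A, A + 1));                 // low halves differ
  unsigned __int128 B = A + ((unsigned __int128)1 << 64);
  assert(!i128EqViaHalves(A, B));                     // high halves differ
  return 0;
}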