Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -19490,6 +19490,42 @@ } } + // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as: + // cmp A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag + // Only do this when the OR and both XORs have no other users; otherwise + // they stay live and the cmp/ccmp pair is pure extra work. + if (!DCI.isBeforeLegalize() && VT.isScalarInteger() && + (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) && + LHS->getOpcode() == ISD::OR && + (LHS.getOperand(0)->getOpcode() == ISD::XOR && + LHS.getOperand(1)->getOpcode() == ISD::XOR) && + LHS.hasOneUse() && LHS.getOperand(0)->hasOneUse() && + LHS.getOperand(1)->hasOneUse()) { + SDValue XOR0 = LHS.getOperand(0); + SDValue XOR1 = LHS.getOperand(1); + // The CCMP only performs its compare when the first compare was EQ. + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC); + EVT TstVT = LHS->getValueType(0); + // Type the flags result as MVT_CC, consistent with the CCMP below. + SDValue Cmp = + DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(TstVT, MVT_CC), + XOR0.getOperand(0), XOR0.getOperand(1)); + SDValue Overflow = Cmp.getValue(1); + // NZCV to install when the first compare was not EQ: all flags clear, so Z + // is 0 and the result reads as NE. The same value serves both SETEQ (an + // AND of equalities) and SETNE (an OR of inequalities). + SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32); + SDValue CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR1.getOperand(0), + XOR1.getOperand(1), NZCVOp, CCVal, Overflow); + // Invert CSEL's operands. 
+ SDValue TVal = DAG.getConstant(1, DL, VT); + SDValue FVal = DAG.getConstant(0, DL, VT); + AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond); + AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, + DAG.getConstant(InvCC, DL, MVT::i32), CCmp); + } + return SDValue(); } Index: llvm/test/CodeGen/AArch64/atomicrmw-O0.ll =================================================================== --- llvm/test/CodeGen/AArch64/atomicrmw-O0.ll +++ llvm/test/CodeGen/AArch64/atomicrmw-O0.ll @@ -216,38 +216,40 @@ ; NOLSE-NEXT: // =>This Loop Header: Depth=1 ; NOLSE-NEXT: // Child Loop BB4_2 Depth 2 ; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload -; NOLSE-NEXT: adds x14, x8, #1 +; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload +; NOLSE-NEXT: adds x14, x13, #1 ; NOLSE-NEXT: cinc x15, x11, hs ; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; NOLSE-NEXT: // Parent Loop BB4_1 Depth=1 ; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x10, x9, [x13] -; NOLSE-NEXT: cmp x10, x8 -; NOLSE-NEXT: cset w12, ne -; NOLSE-NEXT: cmp x9, x11 -; NOLSE-NEXT: cinc w12, w12, ne -; NOLSE-NEXT: cbnz w12, .LBB4_4 +; NOLSE-NEXT: ldaxp x12, x8, [x10] +; NOLSE-NEXT: cmp x12, x13 +; NOLSE-NEXT: cset w9, ne +; NOLSE-NEXT: cmp x8, x11 +; NOLSE-NEXT: cinc w9, w9, ne +; NOLSE-NEXT: cbnz w9, .LBB4_4 ; NOLSE-NEXT: // %bb.3: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 -; NOLSE-NEXT: stlxp w12, x14, x15, [x13] -; NOLSE-NEXT: cbnz w12, .LBB4_2 +; NOLSE-NEXT: stlxp w9, x14, x15, [x10] +; NOLSE-NEXT: cbnz w9, .LBB4_2 ; NOLSE-NEXT: b .LBB4_5 ; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 -; NOLSE-NEXT: stlxp w12, x10, x9, [x13] -; NOLSE-NEXT: cbnz w12, .LBB4_2 +; NOLSE-NEXT: stlxp 
w9, x12, x8, [x10] +; NOLSE-NEXT: cbnz w9, .LBB4_2 ; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1 -; NOLSE-NEXT: eor x11, x9, x11 -; NOLSE-NEXT: eor x8, x10, x8 -; NOLSE-NEXT: orr x8, x8, x11 +; NOLSE-NEXT: mov x9, x8 ; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; NOLSE-NEXT: mov x10, x12 ; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: subs x12, x12, x13 +; NOLSE-NEXT: ccmp x8, x11, #0, eq +; NOLSE-NEXT: cset w8, ne ; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill ; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill -; NOLSE-NEXT: cbnz x8, .LBB4_1 +; NOLSE-NEXT: tbnz w8, #0, .LBB4_1 ; NOLSE-NEXT: b .LBB4_6 ; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload @@ -267,26 +269,26 @@ ; LSE-NEXT: b .LBB4_1 ; LSE-NEXT: .LBB4_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload -; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload +; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload +; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload ; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload -; LSE-NEXT: mov x0, x8 -; LSE-NEXT: mov x1, x10 -; LSE-NEXT: adds x2, x8, #1 -; LSE-NEXT: cinc x11, x10, hs +; LSE-NEXT: mov x0, x11 +; LSE-NEXT: mov x1, x8 +; LSE-NEXT: adds x2, x11, #1 +; LSE-NEXT: cinc x10, x8, hs ; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3 -; LSE-NEXT: mov x3, x11 +; LSE-NEXT: mov x3, x10 ; LSE-NEXT: caspal x0, x1, x2, x3, [x9] ; LSE-NEXT: mov x9, x1 ; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill -; LSE-NEXT: eor x11, x9, x10 ; LSE-NEXT: mov x10, x0 ; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill -; LSE-NEXT: eor x8, x10, x8 -; LSE-NEXT: orr x8, x8, x11 +; LSE-NEXT: subs x11, x10, x11 +; LSE-NEXT: ccmp x9, x8, #0, eq +; LSE-NEXT: cset w8, ne ; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill ; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded 
Spill -; LSE-NEXT: cbnz x8, .LBB4_1 +; LSE-NEXT: tbnz w8, #0, .LBB4_1 ; LSE-NEXT: b .LBB4_2 ; LSE-NEXT: .LBB4_2: // %atomicrmw.end ; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload @@ -606,42 +608,44 @@ ; NOLSE-NEXT: // =>This Loop Header: Depth=1 ; NOLSE-NEXT: // Child Loop BB9_2 Depth 2 ; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload -; NOLSE-NEXT: mov w9, w8 -; NOLSE-NEXT: mvn w10, w9 -; NOLSE-NEXT: // implicit-def: $x9 -; NOLSE-NEXT: mov w9, w10 -; NOLSE-NEXT: orr x14, x9, #0xfffffffffffffffe +; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload +; NOLSE-NEXT: mov w8, w13 +; NOLSE-NEXT: mvn w9, w8 +; NOLSE-NEXT: // implicit-def: $x8 +; NOLSE-NEXT: mov w8, w9 +; NOLSE-NEXT: orr x14, x8, #0xfffffffffffffffe ; NOLSE-NEXT: mov x15, #-1 ; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; NOLSE-NEXT: // Parent Loop BB9_1 Depth=1 ; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x10, x9, [x13] -; NOLSE-NEXT: cmp x10, x8 -; NOLSE-NEXT: cset w12, ne -; NOLSE-NEXT: cmp x9, x11 -; NOLSE-NEXT: cinc w12, w12, ne -; NOLSE-NEXT: cbnz w12, .LBB9_4 +; NOLSE-NEXT: ldaxp x12, x8, [x10] +; NOLSE-NEXT: cmp x12, x13 +; NOLSE-NEXT: cset w9, ne +; NOLSE-NEXT: cmp x8, x11 +; NOLSE-NEXT: cinc w9, w9, ne +; NOLSE-NEXT: cbnz w9, .LBB9_4 ; NOLSE-NEXT: // %bb.3: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 -; NOLSE-NEXT: stlxp w12, x14, x15, [x13] -; NOLSE-NEXT: cbnz w12, .LBB9_2 +; NOLSE-NEXT: stlxp w9, x14, x15, [x10] +; NOLSE-NEXT: cbnz w9, .LBB9_2 ; NOLSE-NEXT: b .LBB9_5 ; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 -; NOLSE-NEXT: stlxp w12, x10, x9, [x13] -; NOLSE-NEXT: cbnz w12, .LBB9_2 +; NOLSE-NEXT: stlxp w9, x12, x8, [x10] +; NOLSE-NEXT: cbnz w9, .LBB9_2 ; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start ; NOLSE-NEXT: // in 
Loop: Header=BB9_1 Depth=1 -; NOLSE-NEXT: eor x11, x9, x11 -; NOLSE-NEXT: eor x8, x10, x8 -; NOLSE-NEXT: orr x8, x8, x11 +; NOLSE-NEXT: mov x9, x8 ; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; NOLSE-NEXT: mov x10, x12 ; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill +; NOLSE-NEXT: subs x12, x12, x13 +; NOLSE-NEXT: ccmp x8, x11, #0, eq +; NOLSE-NEXT: cset w8, ne ; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill ; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill -; NOLSE-NEXT: cbnz x8, .LBB9_1 +; NOLSE-NEXT: tbnz w8, #0, .LBB9_1 ; NOLSE-NEXT: b .LBB9_6 ; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end ; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload @@ -661,30 +665,30 @@ ; LSE-NEXT: b .LBB9_1 ; LSE-NEXT: .LBB9_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload -; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload +; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload +; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload ; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload -; LSE-NEXT: mov x0, x8 -; LSE-NEXT: mov x1, x10 -; LSE-NEXT: mov w11, w8 -; LSE-NEXT: mvn w12, w11 -; LSE-NEXT: // implicit-def: $x11 -; LSE-NEXT: mov w11, w12 -; LSE-NEXT: orr x2, x11, #0xfffffffffffffffe -; LSE-NEXT: mov x11, #-1 +; LSE-NEXT: mov x0, x11 +; LSE-NEXT: mov x1, x8 +; LSE-NEXT: mov w10, w11 +; LSE-NEXT: mvn w12, w10 +; LSE-NEXT: // implicit-def: $x10 +; LSE-NEXT: mov w10, w12 +; LSE-NEXT: orr x2, x10, #0xfffffffffffffffe +; LSE-NEXT: mov x10, #-1 ; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3 -; LSE-NEXT: mov x3, x11 +; LSE-NEXT: mov x3, x10 ; LSE-NEXT: caspal x0, x1, x2, x3, [x9] ; LSE-NEXT: mov x9, x1 ; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill -; LSE-NEXT: eor x11, x9, x10 ; LSE-NEXT: mov x10, x0 ; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill -; LSE-NEXT: eor x8, x10, x8 -; LSE-NEXT: orr x8, x8, x11 +; LSE-NEXT: subs x11, x10, x11 +; LSE-NEXT: ccmp x9, x8, 
#0, eq +; LSE-NEXT: cset w8, ne ; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill ; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill -; LSE-NEXT: cbnz x8, .LBB9_1 +; LSE-NEXT: tbnz w8, #0, .LBB9_1 ; LSE-NEXT: b .LBB9_2 ; LSE-NEXT: .LBB9_2: // %atomicrmw.end ; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload Index: llvm/test/CodeGen/AArch64/bcmp-inline-small.ll =================================================================== --- llvm/test/CodeGen/AArch64/bcmp-inline-small.ll +++ llvm/test/CodeGen/AArch64/bcmp-inline-small.ll @@ -12,10 +12,8 @@ ; CHECKN-NEXT: ldr x9, [x1] ; CHECKN-NEXT: ldur x10, [x0, #7] ; CHECKN-NEXT: ldur x11, [x1, #7] -; CHECKN-NEXT: eor x8, x8, x9 -; CHECKN-NEXT: eor x9, x10, x11 -; CHECKN-NEXT: orr x8, x8, x9 -; CHECKN-NEXT: cmp x8, #0 +; CHECKN-NEXT: cmp x8, x9 +; CHECKN-NEXT: ccmp x10, x11, #0, eq ; CHECKN-NEXT: cset w0, eq ; CHECKN-NEXT: ret ; @@ -44,10 +42,8 @@ ; CHECKN-NEXT: ldr x9, [x1] ; CHECKN-NEXT: ldur x10, [x0, #7] ; CHECKN-NEXT: ldur x11, [x1, #7] -; CHECKN-NEXT: eor x8, x8, x9 -; CHECKN-NEXT: eor x9, x10, x11 -; CHECKN-NEXT: orr x8, x8, x9 -; CHECKN-NEXT: cmp x8, #0 +; CHECKN-NEXT: cmp x8, x9 +; CHECKN-NEXT: ccmp x10, x11, #0, eq ; CHECKN-NEXT: cset w0, eq ; CHECKN-NEXT: ret ; Index: llvm/test/CodeGen/AArch64/dag-combine-setcc.ll =================================================================== --- llvm/test/CodeGen/AArch64/dag-combine-setcc.ll +++ llvm/test/CodeGen/AArch64/dag-combine-setcc.ll @@ -128,3 +128,33 @@ %cmp2 = icmp ne i64 %cast, zeroinitializer ret i1 %cmp2 } + +define i1 @combine_setcc_eq0_conjunction_xor_or(ptr %a, ptr %b) { +; CHECK-LABEL: combine_setcc_eq0_conjunction_xor_or: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp x8, x9, [x0] +; CHECK-NEXT: ldp x10, x11, [x1] +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x11, #0, eq +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16) + %cmp = icmp eq i32 %bcmp, 0 + ret i1 %cmp 
+} + +define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) { +; CHECK-LABEL: combine_setcc_ne0_conjunction_xor_or: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp x8, x9, [x0] +; CHECK-NEXT: ldp x10, x11, [x1] +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x11, #0, eq +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16) + %cmp = icmp ne i32 %bcmp, 0 + ret i1 %cmp +} + +declare i32 @bcmp(ptr nocapture, ptr nocapture, i64) Index: llvm/test/CodeGen/AArch64/i128-cmp.ll =================================================================== --- llvm/test/CodeGen/AArch64/i128-cmp.ll +++ llvm/test/CodeGen/AArch64/i128-cmp.ll @@ -6,10 +6,8 @@ define i1 @cmp_i128_eq(i128 %a, i128 %b) { ; CHECK-LABEL: cmp_i128_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: eor x8, x1, x3 -; CHECK-NEXT: eor x9, x0, x2 -; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: ccmp x1, x3, #0, eq ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %cmp = icmp eq i128 %a, %b @@ -19,10 +17,8 @@ define i1 @cmp_i128_ne(i128 %a, i128 %b) { ; CHECK-LABEL: cmp_i128_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: eor x8, x1, x3 -; CHECK-NEXT: eor x9, x0, x2 -; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: ccmp x1, x3, #0, eq ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %cmp = icmp ne i128 %a, %b @@ -120,10 +116,9 @@ define void @br_on_cmp_i128_eq(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: br_on_cmp_i128_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: eor x8, x1, x3 -; CHECK-NEXT: eor x9, x0, x2 -; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cbnz x8, .LBB10_2 +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: ccmp x1, x3, #0, eq +; CHECK-NEXT: b.ne .LBB10_2 ; CHECK-NEXT: // %bb.1: // %call ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: bl call @@ -142,10 +137,9 @@ define void @br_on_cmp_i128_ne(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: br_on_cmp_i128_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: eor x8, x1, x3 -; CHECK-NEXT: eor x9, x0, x2 -; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cbz x8, .LBB11_2 +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: ccmp x1, x3, #0, eq +; CHECK-NEXT: b.eq .LBB11_2 ; CHECK-NEXT: // %bb.1: // %call ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: bl call Index: llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll =================================================================== --- llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -68,12 +68,10 @@ ; AARCH-NEXT: adds x11, x12, x11 ; AARCH-NEXT: adc x12, x13, x14 ; AARCH-NEXT: adds x10, x11, x10 -; AARCH-NEXT: adc x9, x12, x9 ; AARCH-NEXT: asr x11, x1, #63 -; AARCH-NEXT: eor x9, x9, x11 -; AARCH-NEXT: eor x10, x10, x11 -; AARCH-NEXT: orr x9, x10, x9 -; AARCH-NEXT: cmp x9, #0 +; AARCH-NEXT: adc x9, x12, x9 +; AARCH-NEXT: cmp x10, x11 +; AARCH-NEXT: ccmp x9, x11, #0, eq ; AARCH-NEXT: cset w9, ne ; AARCH-NEXT: tbz x8, #63, .LBB1_2 ; AARCH-NEXT: // %bb.1: // %Entry