diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14034,15 +14034,85 @@
   return SDValue();
 }
 
+// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
+// convert to csel(ccmp(.., cc0)), depending on cc1:
+
+// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
+// =>
+// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
+//
+// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
+// =>
+// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
+static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDValue CSel0 = N->getOperand(0);
+  SDValue CSel1 = N->getOperand(1);
+
+  if (CSel0.getOpcode() != AArch64ISD::CSEL ||
+      CSel1.getOpcode() != AArch64ISD::CSEL)
+    return SDValue();
+
+  if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
+    return SDValue();
+
+  if (!isNullConstant(CSel0.getOperand(0)) ||
+      !isOneConstant(CSel0.getOperand(1)) ||
+      !isNullConstant(CSel1.getOperand(0)) ||
+      !isOneConstant(CSel1.getOperand(1)))
+    return SDValue();
+
+  SDValue Cmp0 = CSel0.getOperand(3);
+  SDValue Cmp1 = CSel1.getOperand(3);
+  AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
+  AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
+  if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
+    return SDValue();
+  if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
+      Cmp0.getOpcode() == AArch64ISD::SUBS) {
+    std::swap(Cmp0, Cmp1);
+    std::swap(CC0, CC1);
+  }
+
+  if (Cmp1.getOpcode() != AArch64ISD::SUBS)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue CCmp;
+
+  if (N->getOpcode() == ISD::AND) {
+    AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
+    SDValue Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
+    unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
+    SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+    CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
+                       Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
+  } else {
+    AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
+    SDValue Condition = DAG.getConstant(CC0, DL, MVT_CC);
+    unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
+    SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+    CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
+                       Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
+  }
+  return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
+                     CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
+                     CCmp);
+}
+
 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
-  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
 
+  if (SDValue R = performANDORCSELCombine(N, DAG))
+    return R;
+
   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return SDValue();
 
+  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
   if (SDValue Res = tryCombineToEXTR(N, DCI))
     return Res;
 
@@ -14171,60 +14241,13 @@
   return SDValue();
 }
 
-// Given a tree of and(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
-// convert to csel(ccmp(.., cc0)), depending on cc1.
-static SDValue PerformANDCSELCombine(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-  SDValue CSel0 = N->getOperand(0);
-  SDValue CSel1 = N->getOperand(1);
-
-  if (CSel0.getOpcode() != AArch64ISD::CSEL ||
-      CSel1.getOpcode() != AArch64ISD::CSEL)
-    return SDValue();
-
-  if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
-    return SDValue();
-
-  if (!isNullConstant(CSel0.getOperand(0)) ||
-      !isOneConstant(CSel0.getOperand(1)) ||
-      !isNullConstant(CSel1.getOperand(0)) ||
-      !isOneConstant(CSel1.getOperand(1)))
-    return SDValue();
-
-  SDValue Cmp0 = CSel0.getOperand(3);
-  SDValue Cmp1 = CSel1.getOperand(3);
-  AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
-  AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
-  if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
-    return SDValue();
-  if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
-      Cmp0.getOpcode() == AArch64ISD::SUBS) {
-    std::swap(Cmp0, Cmp1);
-    std::swap(CC0, CC1);
-  }
-
-  if (Cmp1.getOpcode() != AArch64ISD::SUBS)
-    return SDValue();
-
-  SDLoc DL(N);
-  AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
-  SDValue Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
-  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
-  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
-  SDValue CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
-                             Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
-  return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
-                     CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
-                     CCmp);
-}
-
 static SDValue performANDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
   SelectionDAG &DAG = DCI.DAG;
   SDValue LHS = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
-  if (SDValue R = PerformANDCSELCombine(N, DAG))
+  if (SDValue R = performANDORCSELCombine(N, DAG))
     return R;
 
   if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -754,16 +754,12 @@
 @g = global i32 0
 
-; Should not use ccmp if we have to compute the or expression in an integer
-; register anyway because of other users.
 define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
 ; CHECK-LABEL: select_noccmp2:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmp x0, #0
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    cmp x0, #13
-; CHECK-NEXT:    cset w9, gt
-; CHECK-NEXT:    orr w8, w8, w9
+; CHECK-NEXT:    ccmp x0, #13, #0, ge
+; CHECK-NEXT:    cset w8, gt
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    csel x0, xzr, x3, ne
 ; CHECK-NEXT:    sbfx w8, w8, #0, #1
@@ -799,21 +795,17 @@
 ; CHECK-LABEL: select_noccmp3:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w8, lt
-; CHECK-NEXT:    cmp w0, #13
-; CHECK-NEXT:    cset w9, gt
+; CHECK-NEXT:    ccmp w0, #13, #0, ge
+; CHECK-NEXT:    cset w8, gt
 ; CHECK-NEXT:    cmp w0, #22
-; CHECK-NEXT:    cset w10, lt
-; CHECK-NEXT:    cmp w0, #44
-; CHECK-NEXT:    cset w11, gt
+; CHECK-NEXT:    mov w9, #44
+; CHECK-NEXT:    ccmp w0, w9, #0, ge
+; CHECK-NEXT:    cset w9, gt
 ; CHECK-NEXT:    cmp w0, #99
-; CHECK-NEXT:    cset w12, eq
-; CHECK-NEXT:    cmp w0, #77
-; CHECK-NEXT:    cset w13, eq
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    orr w9, w10, w11
 ; CHECK-NEXT:    and w8, w8, w9
-; CHECK-NEXT:    orr w9, w12, w13
+; CHECK-NEXT:    mov w9, #77
+; CHECK-NEXT:    ccmp w0, w9, #4, ne
+; CHECK-NEXT:    cset w9, eq
 ; CHECK-NEXT:    tst w8, w9
 ; CHECK-NEXT:    csel w0, w1, w2, ne
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
--- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
@@ -257,13 +257,12 @@
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:rhs]
 ; CHECK-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
 ; CHECK-NEXT:    bl __eqtf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w19, eq
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
 ; CHECK-NEXT:    bl __unordtf2
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w8, ne
-; CHECK-NEXT:    orr w0, w8, w19
+; CHECK-NEXT:    ccmp w19, #0, #4, eq
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/cmp-chains.ll b/llvm/test/CodeGen/AArch64/cmp-chains.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cmp-chains.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+
+; Ensure chains of comparisons produce chains of `ccmp`
+
+; (x0 < x1) && (x2 > x3)
+define i32 @cmp_and2(i32 %0, i32 %1, i32 %2, i32 %3) {
+; CHECK-LABEL: cmp_and2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    ccmp w2, w3, #0, lo
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
+  %5 = icmp ult i32 %0, %1
+  %6 = icmp ugt i32 %2, %3
+  %7 = select i1 %5, i1 %6, i1 false
+  %8 = zext i1 %7 to i32
+  ret i32 %8
+}
+
+; (x0 < x1) && (x2 > x3) && (x4 != x5)
+define i32 @cmp_and3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
+; CHECK-LABEL: cmp_and3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    ccmp w2, w3, #0, lo
+; CHECK-NEXT:    ccmp w4, w5, #4, hi
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %7 = icmp ult i32 %0, %1
+  %8 = icmp ugt i32 %2, %3
+  %9 = select i1 %7, i1 %8, i1 false
+  %10 = icmp ne i32 %4, %5
+  %11 = select i1 %9, i1 %10, i1 false
+  %12 = zext i1 %11 to i32
+  ret i32 %12
+}
+
+; (x0 < x1) && (x2 > x3) && (x4 != x5) && (x6 == x7)
+define i32 @cmp_and4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) {
+; CHECK-LABEL: cmp_and4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w2, w3
+; CHECK-NEXT:    ccmp w0, w1, #2, hi
+; CHECK-NEXT:    ccmp w4, w5, #4, lo
+; CHECK-NEXT:    ccmp w6, w7, #0, ne
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %9 = icmp ugt i32 %2, %3
+  %10 = icmp ult i32 %0, %1
+  %11 = select i1 %9, i1 %10, i1 false
+  %12 = icmp ne i32 %4, %5
+  %13 = select i1 %11, i1 %12, i1 false
+  %14 = icmp eq i32 %6, %7
+  %15 = select i1 %13, i1 %14, i1 false
+  %16 = zext i1 %15 to i32
+  ret i32 %16
+}
+
+; (x0 < x1) || (x2 != x3)
+define i32 @cmp_or2(i32 %0, i32 %1, i32 %2, i32 %3) {
+; CHECK-LABEL: cmp_or2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    ccmp w2, w3, #0, hs
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %5 = icmp ult i32 %0, %1
+  %6 = icmp ne i32 %2, %3
+  %7 = select i1 %5, i1 true, i1 %6
+  %8 = zext i1 %7 to i32
+  ret i32 %8
+}
+
+; (x0 < x1) || (x2 > x3) || (x4 != x5)
+define i32 @cmp_or3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
+; CHECK-LABEL: cmp_or3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    ccmp w2, w3, #2, hs
+; CHECK-NEXT:    ccmp w4, w5, #0, ls
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %7 = icmp ult i32 %0, %1
+  %8 = icmp ugt i32 %2, %3
+  %9 = select i1 %7, i1 true, i1 %8
+  %10 = icmp ne i32 %4, %5
+  %11 = select i1 %9, i1 true, i1 %10
+  %12 = zext i1 %11 to i32
+  ret i32 %12
+}
+
+; (x0 < x1) || (x2 > x3) || (x4 != x5) || (x6 == x7)
+define i32 @cmp_or4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) {
+; CHECK-LABEL: cmp_or4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    ccmp w2, w3, #2, hs
+; CHECK-NEXT:    ccmp w4, w5, #0, ls
+; CHECK-NEXT:    ccmp w6, w7, #4, eq
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %9 = icmp ult i32 %0, %1
+  %10 = icmp ugt i32 %2, %3
+  %11 = select i1 %9, i1 true, i1 %10
+  %12 = icmp ne i32 %4, %5
+  %13 = select i1 %11, i1 true, i1 %12
+  %14 = icmp eq i32 %6, %7
+  %15 = select i1 %13, i1 true, i1 %14
+  %16 = zext i1 %15 to i32
+  ret i32 %16
+}
+
+; (x0 != 0) || (x1 != 0)
+define i32 @true_or2(i32 %0, i32 %1) {
+; CHECK-LABEL: true_or2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, w1
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %3 = icmp ne i32 %0, 0
+  %4 = icmp ne i32 %1, 0
+  %5 = select i1 %3, i1 true, i1 %4
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+; (x0 != 0) || (x1 != 0) || (x2 != 0)
+define i32 @true_or3(i32 %0, i32 %1, i32 %2) {
+; CHECK-LABEL: true_or3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w8, w0, w1
+; CHECK-NEXT:    orr w8, w8, w2
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %4 = icmp ne i32 %0, 0
+  %5 = icmp ne i32 %1, 0
+  %6 = select i1 %4, i1 true, i1 %5
+  %7 = icmp ne i32 %2, 0
+  %8 = select i1 %6, i1 true, i1 %7
+  %9 = zext i1 %8 to i32
+  ret i32 %9
+}
diff --git a/llvm/test/CodeGen/AArch64/select-with-and-or.ll b/llvm/test/CodeGen/AArch64/select-with-and-or.ll
--- a/llvm/test/CodeGen/AArch64/select-with-and-or.ll
+++ b/llvm/test/CodeGen/AArch64/select-with-and-or.ll
@@ -18,10 +18,8 @@
 ; CHECK-LABEL: or:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp w0, w1
-; CHECK-NEXT:    cset w8, eq
-; CHECK-NEXT:    cmp w2, w3
-; CHECK-NEXT:    cset w9, gt
-; CHECK-NEXT:    orr w0, w8, w9
+; CHECK-NEXT:    ccmp w2, w3, #0, ne
+; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
   %a = icmp eq i32 %x, %y
   %b = icmp sgt i32 %z, %w
@@ -46,10 +44,8 @@
 ; CHECK-LABEL: or_not:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp w0, w1
-; CHECK-NEXT:    cset w8, ne
-; CHECK-NEXT:    cmp w2, w3
-; CHECK-NEXT:    cset w9, gt
-; CHECK-NEXT:    orr w0, w8, w9
+; CHECK-NEXT:    ccmp w2, w3, #0, eq
+; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
   %a = icmp eq i32 %x, %y
   %b = icmp sgt i32 %z, %w
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -4,24 +4,21 @@
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; AARCH-LABEL: muloti_test:
 ; AARCH:       // %bb.0: // %start
-; AARCH-NEXT:    umulh x8, x1, x2
-; AARCH-NEXT:    mul x9, x3, x0
-; AARCH-NEXT:    cmp xzr, x8
-; AARCH-NEXT:    umulh x10, x3, x0
-; AARCH-NEXT:    cset w8, ne
+; AARCH-NEXT:    mul x8, x3, x0
+; AARCH-NEXT:    umulh x9, x0, x2
+; AARCH-NEXT:    madd x8, x1, x2, x8
+; AARCH-NEXT:    umulh x10, x1, x2
+; AARCH-NEXT:    adds x8, x9, x8
+; AARCH-NEXT:    cset w9, hs
 ; AARCH-NEXT:    cmp x1, #0
 ; AARCH-NEXT:    ccmp x3, #0, #4, ne
-; AARCH-NEXT:    madd x9, x1, x2, x9
-; AARCH-NEXT:    cset w11, ne
-; AARCH-NEXT:    cmp xzr, x10
-; AARCH-NEXT:    umulh x10, x0, x2
-; AARCH-NEXT:    orr w8, w11, w8
-; AARCH-NEXT:    cset w11, ne
+; AARCH-NEXT:    mov x1, x8
+; AARCH-NEXT:    ccmp xzr, x10, #0, eq
+; AARCH-NEXT:    umulh x10, x3, x0
 ; AARCH-NEXT:    mul x0, x0, x2
-; AARCH-NEXT:    adds x1, x10, x9
-; AARCH-NEXT:    orr w8, w8, w11
-; AARCH-NEXT:    cset w9, hs
-; AARCH-NEXT:    orr w2, w8, w9
+; AARCH-NEXT:    ccmp xzr, x10, #0, eq
+; AARCH-NEXT:    cset w10, ne
+; AARCH-NEXT:    orr w2, w10, w9
 ; AARCH-NEXT:    ret
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -322,48 +322,40 @@
 define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
 ; CHECK-LABEL: umulo_v2i128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umulh x8, x3, x6
-; CHECK-NEXT:    mul x10, x7, x2
-; CHECK-NEXT:    cmp xzr, x8
-; CHECK-NEXT:    umulh x8, x7, x2
-; CHECK-NEXT:    cset w9, ne
+; CHECK-NEXT:    mul x8, x7, x2
+; CHECK-NEXT:    umulh x9, x2, x6
+; CHECK-NEXT:    madd x8, x3, x6, x8
+; CHECK-NEXT:    umulh x10, x3, x6
+; CHECK-NEXT:    adds x8, x9, x8
+; CHECK-NEXT:    umulh x11, x7, x2
+; CHECK-NEXT:    cset w9, hs
 ; CHECK-NEXT:    cmp x3, #0
 ; CHECK-NEXT:    ccmp x7, #0, #4, ne
-; CHECK-NEXT:    umulh x11, x2, x6
-; CHECK-NEXT:    madd x10, x3, x6, x10
-; CHECK-NEXT:    umulh x12, x1, x4
-; CHECK-NEXT:    cset w13, ne
-; CHECK-NEXT:    cmp xzr, x8
-; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    umulh x13, x1, x4
+; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    mul x10, x5, x0
+; CHECK-NEXT:    madd x10, x1, x4, x10
+; CHECK-NEXT:    ccmp xzr, x11, #0, eq
+; CHECK-NEXT:    umulh x11, x0, x4
+; CHECK-NEXT:    cset w12, ne
 ; CHECK-NEXT:    adds x10, x11, x10
 ; CHECK-NEXT:    cset w11, hs
-; CHECK-NEXT:    cmp xzr, x12
-; CHECK-NEXT:    cset w12, ne
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x5, #0, #4, ne
-; CHECK-NEXT:    mul x15, x5, x0
-; CHECK-NEXT:    umulh x14, x5, x0
-; CHECK-NEXT:    orr w9, w13, w9
-; CHECK-NEXT:    umulh x16, x0, x4
-; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    madd x15, x1, x4, x15
-; CHECK-NEXT:    cset w17, ne
-; CHECK-NEXT:    cmp xzr, x14
-; CHECK-NEXT:    orr w12, w17, w12
-; CHECK-NEXT:    cset w14, ne
-; CHECK-NEXT:    adds x15, x16, x15
-; CHECK-NEXT:    orr w12, w12, w14
-; CHECK-NEXT:    cset w14, hs
-; CHECK-NEXT:    orr w12, w12, w14
-; CHECK-NEXT:    orr w8, w8, w11
-; CHECK-NEXT:    mul x11, x0, x4
-; CHECK-NEXT:    ldr x9, [sp]
-; CHECK-NEXT:    fmov s0, w12
-; CHECK-NEXT:    stp x11, x15, [x9]
-; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    mul x8, x2, x6
+; CHECK-NEXT:    orr w9, w12, w9
+; CHECK-NEXT:    mul x12, x0, x4
+; CHECK-NEXT:    ccmp xzr, x13, #0, eq
+; CHECK-NEXT:    umulh x13, x5, x0
+; CHECK-NEXT:    ccmp xzr, x13, #0, eq
+; CHECK-NEXT:    cset w13, ne
+; CHECK-NEXT:    orr w11, w13, w11
+; CHECK-NEXT:    fmov s0, w11
+; CHECK-NEXT:    ldr x11, [sp]
+; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    mul x9, x2, x6
+; CHECK-NEXT:    stp x12, x10, [x11]
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #31
-; CHECK-NEXT:    stp x8, x10, [x9, #16]
+; CHECK-NEXT:    stp x9, x8, [x11, #16]
 ; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
 ; CHECK-NEXT:    ret
   %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
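
For illustration only (not part of the patch), a minimal C++ sketch of the source-level pattern these tests cover; the function name is hypothetical. Compiled with clang at -O2 it produces an and of two unsigned icmps (possibly via select), the same shape as @cmp_and2 above, so with this combine it should now lower to a single cmp/ccmp/cset chain instead of two cset instructions joined by an orr:

// Unsigned comparison chain; the front end emits the icmp + select/and pattern
// that performANDORCSELCombine folds into a conditional compare.
bool chained_cmp(unsigned a, unsigned b, unsigned c, unsigned d) {
  // Expected codegen (per cmp_and2): cmp w0, w1 ; ccmp w2, w3, #0, lo ; cset w0, hi
  return (a < b) && (c > d);
}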