diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2802,6 +2802,95 @@
   return SDValue();
 }
 
+// If we are facing some sort of diamond carry/borrow in/out pattern, try to
+// match patterns like:
+//
+//          (uaddo A, B)            CarryIn
+//            |  \                     |
+//            |   \                    |
+//    PartialSum   PartialCarryOutX   /
+//            |        |             /
+//            |    ____|____________/
+//            |   /    |
+//     (uaddo *, *)    \________
+//       |  \                   \
+//       |   \                   |
+//       |    PartialCarryOutY   |
+//       |        \              |
+//       |         \             /
+//   AddCarrySum    |    ______/
+//                  |   /
+//   CarryOut = (or *, *)
+//
+// And generate ADDCARRY (or SUBCARRY) with two result values:
+//
+//    {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn)
+//
+// Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with
+// a single path for carry/borrow out propagation.
+static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
+                                   const TargetLowering &TLI, SDValue Carry0,
+                                   SDValue Carry1, SDNode *N) {
+  if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1)
+    return SDValue();
+  unsigned Opcode = Carry0.getOpcode();
+  if (Opcode != Carry1.getOpcode())
+    return SDValue();
+  if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
+    return SDValue();
+
+  // Canonicalize the add/sub of A and B as Carry0 and the add/sub of the
+  // carry/borrow in as Carry1. (The top and middle uaddo nodes, respectively,
+  // in the above ASCII art.)
+  if (Carry1.getOperand(0) != Carry0.getValue(0) &&
+      Carry1.getOperand(1) != Carry0.getValue(0))
+    std::swap(Carry0, Carry1);
+  if (Carry1.getOperand(0) != Carry0.getValue(0) &&
+      Carry1.getOperand(1) != Carry0.getValue(0))
+    return SDValue();
+
+  // The carry in value must be on the right-hand side for subtraction.
+  unsigned CarryInOperandNum =
+      Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0;
+  if (Opcode == ISD::USUBO && CarryInOperandNum != 1)
+    return SDValue();
+  SDValue CarryIn = Carry1.getOperand(CarryInOperandNum);
+
+  unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
+  if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType()))
+    return SDValue();
+
+  // Verify that the carry/borrow in is plausibly a carry/borrow bit.
+  // TODO: make getAsCarry() aware of how partial carries are merged.
+  if (CarryIn.getOpcode() != ISD::ZERO_EXTEND)
+    return SDValue();
+  if (CarryIn.getOperand(0).getValueType() != MVT::i1)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue Merged =
+      DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0),
+                  Carry0.getOperand(1), CarryIn.getOperand(0));
+
+  // Because we have proven that the result of the UADDO/USUBO of A and B
+  // feeds into the UADDO/USUBO that consumes the carry/borrow in, we know
+  // that if the first UADDO/USUBO overflows, the second UADDO/USUBO cannot.
+  // For example, consider 8-bit numbers where 0xFF is the maximum value:
+  //
+  //   0xFF + 0xFF == 0xFE with carry, but 0xFE + 1 does not carry
+  //   0x00 - 0xFF == 1 with a carry/borrow, but 1 - 1 == 0 (no carry/borrow)
+  //
+  // This is important because it means that OR and XOR can be used to merge
+  // carry flags, and that AND can return a constant zero.
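+  //
+  // As a quick sanity check (an illustrative standalone sketch, not code this
+  // patch compiles or runs; the names X, Y, and CIn are made up here), the
+  // mutual exclusion of the two partial carries, and hence the equivalence of
+  // OR/XOR and the AND-is-zero fold on them, can be verified exhaustively for
+  // 8-bit limbs:
+  //
+  //   for (unsigned A = 0; A <= 0xFF; ++A)
+  //     for (unsigned B = 0; B <= 0xFF; ++B)
+  //       for (unsigned CIn = 0; CIn <= 1; ++CIn) {
+  //         bool X = A + B > 0xFF;                  // PartialCarryOutX
+  //         bool Y = ((A + B) & 0xFF) + CIn > 0xFF; // PartialCarryOutY
+  //         assert(!(X && Y) && "partial carries are mutually exclusive");
+  //         assert((X | Y) == (X ^ Y) && (X & Y) == 0);
+  //       }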
+  //
+  // TODO: match other operations that can merge flags (ADD, etc)
+  DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0));
+  if (N->getOpcode() == ISD::AND)
+    return DAG.getConstant(0, DL, MVT::i1);
+  return Merged.getValue(1);
+}
+
 SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
                                        SDNode *N) {
   // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
@@ -5093,6 +5182,9 @@
   if (SDValue Shuffle = XformToShuffleWithZero(N))
     return Shuffle;
 
+  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
+    return Combined;
+
   // fold (and (or x, C), D) -> D if (C & D) == D
   auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
     return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
@@ -5787,6 +5879,9 @@
   if (SDValue Combined = visitORLike(N0, N1, N))
     return Combined;
 
+  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
+    return Combined;
+
   // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
   if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
     return BSwap;
@@ -7049,6 +7144,9 @@
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
+  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
+    return Combined;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll
--- a/llvm/test/CodeGen/X86/addcarry.ll
+++ b/llvm/test/CodeGen/X86/addcarry.ll
@@ -511,40 +511,13 @@
 define i32 @add_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: add_U320_without_i128_or:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    addq 8(%rdi), %rdx
-; CHECK-NEXT:    setb %r10b
 ; CHECK-NEXT:    addq %rsi, (%rdi)
-; CHECK-NEXT:    adcq $0, %rdx
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    addq 16(%rdi), %rcx
-; CHECK-NEXT:    setb %r11b
-; CHECK-NEXT:    orb %r10b, %al
-; CHECK-NEXT:    movzbl %al, %ebx
-; CHECK-NEXT:    addq %rcx, %rbx
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    addq 24(%rdi), %r8
-; CHECK-NEXT:    setb %r10b
-; CHECK-NEXT:    orb %r11b, %cl
-; CHECK-NEXT:    movzbl %cl, %esi
-; CHECK-NEXT:    addq %r8, %rsi
+; CHECK-NEXT:    adcq %rdx, 8(%rdi)
+; CHECK-NEXT:    adcq %rcx, 16(%rdi)
+; CHECK-NEXT:    adcq %r8, 24(%rdi)
+; CHECK-NEXT:    adcq %r9, 32(%rdi)
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    addq 32(%rdi), %r9
-; CHECK-NEXT:    setb %r8b
-; CHECK-NEXT:    orb %r10b, %al
 ; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    addq %r9, %rax
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    movq %rdx, 8(%rdi)
-; CHECK-NEXT:    movq %rbx, 16(%rdi)
-; CHECK-NEXT:    movq %rsi, 24(%rdi)
-; CHECK-NEXT:    movq %rax, 32(%rdi)
-; CHECK-NEXT:    orb %r8b, %cl
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
   %8 = load i64, i64* %7, align 8
@@ -594,40 +567,13 @@
 define i32 @add_U320_without_i128_xor(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: add_U320_without_i128_xor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    addq 8(%rdi), %rdx
-; CHECK-NEXT:    setb %r10b
 ; CHECK-NEXT:    addq %rsi, (%rdi)
-; CHECK-NEXT:    adcq $0, %rdx
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    addq 16(%rdi), %rcx
-; CHECK-NEXT:    setb %r11b
-; CHECK-NEXT:    xorb %r10b, %al
-; CHECK-NEXT:    movzbl %al, %ebx
-; CHECK-NEXT:    addq %rcx, %rbx
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    addq 24(%rdi), %r8
-; CHECK-NEXT:    setb %r10b
-; CHECK-NEXT:    xorb %r11b, %cl
-; CHECK-NEXT:    movzbl %cl, %esi
-; CHECK-NEXT:    addq %r8, %rsi
+; CHECK-NEXT:    adcq %rdx, 8(%rdi)
+; CHECK-NEXT:    adcq %rcx, 16(%rdi)
+; CHECK-NEXT:    adcq %r8, 24(%rdi)
+; CHECK-NEXT:    adcq %r9, 32(%rdi)
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    addq 32(%rdi), %r9
-; CHECK-NEXT:    setb %r8b
-; CHECK-NEXT:    xorb %r10b, %al
 ; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    addq %r9, %rax
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    movq %rdx, 8(%rdi)
-; CHECK-NEXT:    movq %rbx, 16(%rdi)
-; CHECK-NEXT:    movq %rsi, 24(%rdi)
-; CHECK-NEXT:    movq %rax, 32(%rdi)
-; CHECK-NEXT:    xorb %r8b, %cl
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
   %8 = load i64, i64* %7, align 8
@@ -674,34 +620,71 @@
   ret i32 %43
 }
 
+; Either the primary addition or the addition of the carry can overflow, but
+; they cannot both overflow.
+define i32 @bogus_add_U320_without_i128_and(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
+; CHECK-LABEL: bogus_add_U320_without_i128_and:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addq %rsi, (%rdi)
+; CHECK-NEXT:    adcq %rdx, 8(%rdi)
+; CHECK-NEXT:    addq %rcx, 16(%rdi)
+; CHECK-NEXT:    addq %r8, 24(%rdi)
+; CHECK-NEXT:    addq %r9, 32(%rdi)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
+  %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
+  %8 = load i64, i64* %7, align 8
+  %9 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 1
+  %10 = load i64, i64* %9, align 8
+  %11 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 2
+  %12 = load i64, i64* %11, align 8
+  %13 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 3
+  %14 = load i64, i64* %13, align 8
+  %15 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 4
+  %16 = load i64, i64* %15, align 8
+  %17 = add i64 %8, %1
+  %18 = add i64 %10, %2
+  %19 = icmp ult i64 %17, %1
+  %20 = zext i1 %19 to i64
+  %21 = add i64 %18, %20
+  %22 = add i64 %12, %3
+  %23 = icmp ult i64 %18, %10
+  %24 = icmp ult i64 %21, %18
+  %25 = and i1 %23, %24
+  %26 = zext i1 %25 to i64
+  %27 = add i64 %22, %26
+  %28 = add i64 %14, %4
+  %29 = icmp ult i64 %22, %12
+  %30 = icmp ult i64 %27, %22
+  %31 = and i1 %29, %30
+  %32 = zext i1 %31 to i64
+  %33 = add i64 %28, %32
+  %34 = add i64 %16, %5
+  %35 = icmp ult i64 %28, %14
+  %36 = icmp ult i64 %33, %28
+  %37 = and i1 %35, %36
+  %38 = zext i1 %37 to i64
+  %39 = add i64 %34, %38
+  store i64 %17, i64* %7, align 8
+  store i64 %21, i64* %9, align 8
+  store i64 %27, i64* %11, align 8
+  store i64 %33, i64* %13, align 8
+  store i64 %39, i64* %15, align 8
+  %40 = icmp ult i64 %34, %16
+  %41 = icmp ult i64 %39, %34
+  %42 = and i1 %40, %41
+  %43 = zext i1 %42 to i32
+  ret i32 %43
+}
+
 define void @add_U320_without_i128_or_no_ret(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: add_U320_without_i128_or_no_ret:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addq 8(%rdi), %rdx
-; CHECK-NEXT:    setb %r10b
 ; CHECK-NEXT:    addq %rsi, (%rdi)
-; CHECK-NEXT:    adcq $0, %rdx
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    addq 16(%rdi), %rcx
-; CHECK-NEXT:    setb %r11b
-; CHECK-NEXT:    orb %r10b, %al
-; CHECK-NEXT:    movzbl %al, %esi
-; CHECK-NEXT:    addq %rcx, %rsi
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    addq 24(%rdi), %r8
-; CHECK-NEXT:    setb %r10b
-; CHECK-NEXT:    orb %r11b, %cl
-; CHECK-NEXT:    movzbl %cl, %ecx
-; CHECK-NEXT:    addq %r8, %rcx
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    addq 32(%rdi), %r9
-; CHECK-NEXT:    orb %r10b, %al
-; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    addq %r9, %rax
-; CHECK-NEXT:    movq %rdx, 8(%rdi)
-; CHECK-NEXT:    movq %rsi, 16(%rdi)
-; CHECK-NEXT:    movq %rcx, 24(%rdi)
-; CHECK-NEXT:    movq %rax, 32(%rdi)
+; CHECK-NEXT:    adcq %rdx, 8(%rdi)
+; CHECK-NEXT:    adcq %rcx, 16(%rdi)
+; CHECK-NEXT:    adcq %r8, 24(%rdi)
+; CHECK-NEXT:    adcq %r9, 32(%rdi)
 ; CHECK-NEXT:    retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
   %8 = load i64, i64* %7, align 8
@@ -747,34 +730,12 @@
 define i32 @add_U320_uaddo(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: add_U320_uaddo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addq 8(%rdi), %rdx
-; CHECK-NEXT:    setb %r10b
 ; CHECK-NEXT:    addq %rsi, (%rdi)
-; CHECK-NEXT:    adcq $0, %rdx
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    orb %r10b, %al
-; CHECK-NEXT:    movzbl %al, %esi
-; CHECK-NEXT:    addq 16(%rdi), %rcx
-; CHECK-NEXT:    setb %r10b
-; CHECK-NEXT:    addq %rsi, %rcx
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    orb %r10b, %al
-; CHECK-NEXT:    movzbl %al, %esi
-; CHECK-NEXT:    addq 24(%rdi), %r8
-; CHECK-NEXT:    setb %r10b
-; CHECK-NEXT:    addq %rsi, %r8
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    orb %r10b, %al
-; CHECK-NEXT:    movzbl %al, %esi
-; CHECK-NEXT:    addq 32(%rdi), %r9
-; CHECK-NEXT:    setb %r10b
-; CHECK-NEXT:    addq %rsi, %r9
+; CHECK-NEXT:    adcq %rdx, 8(%rdi)
+; CHECK-NEXT:    adcq %rcx, 16(%rdi)
+; CHECK-NEXT:    adcq %r8, 24(%rdi)
+; CHECK-NEXT:    adcq %r9, 32(%rdi)
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    orb %r10b, %al
-; CHECK-NEXT:    movq %rdx, 8(%rdi)
-; CHECK-NEXT:    movq %rcx, 16(%rdi)
-; CHECK-NEXT:    movq %r8, 24(%rdi)
-; CHECK-NEXT:    movq %r9, 32(%rdi)
 ; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
@@ -838,22 +799,14 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movq (%rsi), %rcx
-; CHECK-NEXT:    movq (%rdx), %r8
-; CHECK-NEXT:    leaq (%rcx,%r8), %rdi
-; CHECK-NEXT:    movq %rdi, (%rax)
-; CHECK-NEXT:    movq 8(%rsi), %rdi
-; CHECK-NEXT:    addq 8(%rdx), %rdi
-; CHECK-NEXT:    setb %r9b
-; CHECK-NEXT:    addq %r8, %rcx
-; CHECK-NEXT:    adcq $0, %rdi
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    orb %r9b, %cl
-; CHECK-NEXT:    movzbl %cl, %ecx
-; CHECK-NEXT:    movq %rdi, 8(%rax)
-; CHECK-NEXT:    movq 16(%rsi), %rsi
-; CHECK-NEXT:    addq 16(%rdx), %rsi
-; CHECK-NEXT:    addq %rcx, %rsi
-; CHECK-NEXT:    movq %rsi, 16(%rax)
+; CHECK-NEXT:    addq (%rdx), %rcx
+; CHECK-NEXT:    movq %rcx, (%rdi)
+; CHECK-NEXT:    movq 8(%rsi), %rcx
+; CHECK-NEXT:    adcq 8(%rdx), %rcx
+; CHECK-NEXT:    movq %rcx, 8(%rdi)
+; CHECK-NEXT:    movq 16(%rsi), %rcx
+; CHECK-NEXT:    adcq 16(%rdx), %rcx
+; CHECK-NEXT:    movq %rcx, 16(%rdi)
 ; CHECK-NEXT:    retq
   %4 = getelementptr inbounds %struct.U192, %struct.U192* %1, i64 0, i32 0, i64 0
   %5 = load i64, i64* %4, align 8
@@ -896,12 +849,9 @@
 define zeroext i1 @uaddo_U128_without_i128_or(i64 %0, i64 %1, i64 %2, i64 %3, %uint128* nocapture %4) nounwind {
 ; CHECK-LABEL: uaddo_U128_without_i128_or:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addq %rcx, %rsi
-; CHECK-NEXT:    setb %cl
 ; CHECK-NEXT:    addq %rdx, %rdi
-; CHECK-NEXT:    adcq $0, %rsi
+; CHECK-NEXT:    adcq %rcx, %rsi
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    orb %cl, %al
 ; CHECK-NEXT:    movq %rsi, (%r8)
 ; CHECK-NEXT:    movq %rdi, 8(%r8)
 ; CHECK-NEXT:    retq
@@ -927,18 +877,12 @@
 ; CHECK-LABEL: add_U192_without_i128_or:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    addq %r9, %rdx
-; CHECK-NEXT:    setb %dil
 ; CHECK-NEXT:    addq %r8, %rsi
-; CHECK-NEXT:    adcq $0, %rdx
-; CHECK-NEXT:    setb %r8b
-; CHECK-NEXT:    orb %dil, %r8b
-; CHECK-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    movzbl %r8b, %edi
-; CHECK-NEXT:    addq %rcx, %rdi
-; CHECK-NEXT:    movq %rdi, (%rax)
-; CHECK-NEXT:    movq %rdx, 8(%rax)
-; CHECK-NEXT:    movq %rsi, 16(%rax)
+; CHECK-NEXT:    adcq %r9, %rdx
+; CHECK-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movq %rcx, (%rdi)
+; CHECK-NEXT:    movq %rdx, 8(%rdi)
+; CHECK-NEXT:    movq %rsi, 16(%rdi)
 ; CHECK-NEXT:    retq
   %8 = add i64 %4, %1
   %9 = icmp ult i64 %8, %1
@@ -969,29 +913,18 @@
 ; CHECK-LABEL: add_U256_without_i128_or_by_i64_words:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq (%rdx), %r9
-; CHECK-NEXT:    movq 8(%rdx), %r10
-; CHECK-NEXT:    addq 8(%rsi), %r10
-; CHECK-NEXT:    setb %r8b
-; CHECK-NEXT:    addq (%rsi), %r9
-; CHECK-NEXT:    adcq $0, %r10
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    orb %r8b, %cl
-; CHECK-NEXT:    movq 16(%rdx), %rdi
-; CHECK-NEXT:    addq 16(%rsi), %rdi
-; CHECK-NEXT:    setb %r8b
-; CHECK-NEXT:    movzbl %cl, %r11d
-; CHECK-NEXT:    addq %rdi, %r11
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    orb %r8b, %cl
+; CHECK-NEXT:    movq (%rdx), %r8
+; CHECK-NEXT:    movq 8(%rdx), %rdi
+; CHECK-NEXT:    addq (%rsi), %r8
+; CHECK-NEXT:    adcq 8(%rsi), %rdi
+; CHECK-NEXT:    movq 16(%rdx), %rcx
+; CHECK-NEXT:    adcq 16(%rsi), %rcx
 ; CHECK-NEXT:    movq 24(%rdx), %rdx
-; CHECK-NEXT:    addq 24(%rsi), %rdx
-; CHECK-NEXT:    movzbl %cl, %ecx
-; CHECK-NEXT:    addq %rdx, %rcx
-; CHECK-NEXT:    movq %rcx, (%rax)
-; CHECK-NEXT:    movq %r11, 8(%rax)
-; CHECK-NEXT:    movq %r10, 16(%rax)
-; CHECK-NEXT:    movq %r9, 24(%rax)
+; CHECK-NEXT:    adcq 24(%rsi), %rdx
+; CHECK-NEXT:    movq %rdx, (%rax)
+; CHECK-NEXT:    movq %rcx, 8(%rax)
+; CHECK-NEXT:    movq %rdi, 16(%rax)
+; CHECK-NEXT:    movq %r8, 24(%rax)
 ; CHECK-NEXT:    retq
   %4 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 0, i32 0
   %5 = load i64, i64* %4, align 8
@@ -1043,24 +976,15 @@
 ; CHECK-LABEL: add_U256_without_i128_or_recursive:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq (%rdx), %r9
+; CHECK-NEXT:    movq (%rdx), %r8
 ; CHECK-NEXT:    movq 8(%rdx), %rdi
-; CHECK-NEXT:    addq 8(%rsi), %rdi
-; CHECK-NEXT:    setb %r8b
-; CHECK-NEXT:    addq (%rsi), %r9
-; CHECK-NEXT:    adcq $0, %rdi
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    orb %r8b, %cl
-; CHECK-NEXT:    movq 16(%rdx), %r8
-; CHECK-NEXT:    movq 24(%rdx), %r10
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    addq 16(%rsi), %r8
-; CHECK-NEXT:    setb %dl
-; CHECK-NEXT:    addq 24(%rsi), %r10
-; CHECK-NEXT:    movzbl %cl, %ecx
-; CHECK-NEXT:    addq %r8, %rcx
-; CHECK-NEXT:    adcq %r10, %rdx
-; CHECK-NEXT:    movq %r9, (%rax)
+; CHECK-NEXT:    addq (%rsi), %r8
+; CHECK-NEXT:    adcq 8(%rsi), %rdi
+; CHECK-NEXT:    movq 16(%rdx), %rcx
+; CHECK-NEXT:    movq 24(%rdx), %rdx
+; CHECK-NEXT:    adcq 16(%rsi), %rcx
+; CHECK-NEXT:    adcq 24(%rsi), %rdx
+; CHECK-NEXT:    movq %r8, (%rax)
 ; CHECK-NEXT:    movq %rdi, 8(%rax)
 ; CHECK-NEXT:    movq %rcx, 16(%rax)
 ; CHECK-NEXT:    movq %rdx, 24(%rax)
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -192,51 +192,13 @@
 define i32 @sub_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: sub_U320_without_i128_or:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %r14
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movq 8(%rdi), %r14
-; CHECK-NEXT:    movq 16(%rdi), %r10
-; CHECK-NEXT:    movq 24(%rdi), %r11
-; CHECK-NEXT:    movq 32(%rdi), %rbx
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    subq %rsi, (%rdi)
+; CHECK-NEXT:    sbbq %rdx, 8(%rdi)
+; CHECK-NEXT:    sbbq %rcx, 16(%rdi)
+; CHECK-NEXT:    sbbq %r8, 24(%rdi)
+; CHECK-NEXT:    sbbq %r9, 32(%rdi)
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    subq %rdx, %r14
-; CHECK-NEXT:    setb %dl
-; CHECK-NEXT:    subq %rax, %r14
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    subq %rcx, %r10
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    orb %dl, %al
-; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    subq %rax, %r10
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    subq %r8, %r11
-; CHECK-NEXT:    setb %dl
-; CHECK-NEXT:    orb %cl, %al
-; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    subq %rax, %r11
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    subq %r9, %rbx
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    orb %dl, %al
-; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    subq %rax, %rbx
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    movq %r14, 8(%rdi)
-; CHECK-NEXT:    movq %r10, 16(%rdi)
-; CHECK-NEXT:    movq %r11, 24(%rdi)
-; CHECK-NEXT:    movq %rbx, 32(%rdi)
-; CHECK-NEXT:    orb %cl, %al
 ; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    popq %r14
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
   %8 = load i64, i64* %7, align 8
@@ -286,51 +248,13 @@
 define i32 @sub_U320_usubo(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: sub_U320_usubo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %r14
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movq 8(%rdi), %r14
-; CHECK-NEXT:    movq 16(%rdi), %r10
-; CHECK-NEXT:    movq 24(%rdi), %r11
-; CHECK-NEXT:    movq 32(%rdi), %rbx
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    subq %rsi, (%rdi)
+; CHECK-NEXT:    sbbq %rdx, 8(%rdi)
+; CHECK-NEXT:    sbbq %rcx, 16(%rdi)
+; CHECK-NEXT:    sbbq %r8, 24(%rdi)
+; CHECK-NEXT:    sbbq %r9, 32(%rdi)
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    subq %rdx, %r14
-; CHECK-NEXT:    setb %dl
-; CHECK-NEXT:    subq %rax, %r14
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    orb %dl, %al
 ; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    subq %rcx, %r10
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    subq %rax, %r10
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    orb %cl, %al
-; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    subq %r8, %r11
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    subq %rax, %r11
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    orb %cl, %al
-; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    subq %r9, %rbx
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    subq %rax, %rbx
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    orb %cl, %al
-; CHECK-NEXT:    movq %r14, 8(%rdi)
-; CHECK-NEXT:    movq %r10, 16(%rdi)
-; CHECK-NEXT:    movq %r11, 24(%rdi)
-; CHECK-NEXT:    movq %rbx, 32(%rdi)
-; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    popq %r14
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
   %8 = load i64, i64* %7, align 8
@@ -393,22 +317,14 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movq (%rsi), %rcx
-; CHECK-NEXT:    xorl %r9d, %r9d
 ; CHECK-NEXT:    subq (%rdx), %rcx
-; CHECK-NEXT:    setb %r9b
 ; CHECK-NEXT:    movq %rcx, (%rdi)
-; CHECK-NEXT:    movq 8(%rsi), %rdi
-; CHECK-NEXT:    subq 8(%rdx), %rdi
-; CHECK-NEXT:    setb %r8b
-; CHECK-NEXT:    subq %r9, %rdi
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    orb %r8b, %cl
-; CHECK-NEXT:    movzbl %cl, %ecx
-; CHECK-NEXT:    movq %rdi, 8(%rax)
-; CHECK-NEXT:    movq 16(%rsi), %rsi
-; CHECK-NEXT:    subq 16(%rdx), %rsi
-; CHECK-NEXT:    subq %rcx, %rsi
-; CHECK-NEXT:    movq %rsi, 16(%rax)
+; CHECK-NEXT:    movq 8(%rsi), %rcx
+; CHECK-NEXT:    sbbq 8(%rdx), %rcx
+; CHECK-NEXT:    movq %rcx, 8(%rdi)
+; CHECK-NEXT:    movq 16(%rsi), %rcx
+; CHECK-NEXT:    sbbq 16(%rdx), %rcx
+; CHECK-NEXT:    movq %rcx, 16(%rdi)
 ; CHECK-NEXT:    retq
   %4 = getelementptr inbounds %struct.U192, %struct.U192* %1, i64 0, i32 0, i64 0
   %5 = load i64, i64* %4, align 8
@@ -454,28 +370,23 @@
 ; CHECK-LABEL: sub_U256_without_i128_or_recursive:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq (%rsi), %r8
+; CHECK-NEXT:    movq (%rsi), %r9
 ; CHECK-NEXT:    movq 8(%rsi), %r10
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    subq (%rdx), %r8
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    subq 8(%rdx), %r10
-; CHECK-NEXT:    setb %r9b
-; CHECK-NEXT:    subq %rcx, %r10
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    orb %r9b, %cl
-; CHECK-NEXT:    movq 16(%rsi), %rdi
+; CHECK-NEXT:    subq (%rdx), %r9
+; CHECK-NEXT:    sbbq 8(%rdx), %r10
+; CHECK-NEXT:    setb %r8b
+; CHECK-NEXT:    movq 16(%rsi), %rcx
 ; CHECK-NEXT:    movq 24(%rsi), %rsi
-; CHECK-NEXT:    xorl %r9d, %r9d
-; CHECK-NEXT:    subq 16(%rdx), %rdi
-; CHECK-NEXT:    setb %r9b
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    subq 16(%rdx), %rcx
+; CHECK-NEXT:    setb %dil
 ; CHECK-NEXT:    subq 24(%rdx), %rsi
-; CHECK-NEXT:    movzbl %cl, %ecx
-; CHECK-NEXT:    subq %rcx, %rdi
-; CHECK-NEXT:    sbbq %r9, %rsi
-; CHECK-NEXT:    movq %r8, (%rax)
+; CHECK-NEXT:    movzbl %r8b, %edx
+; CHECK-NEXT:    subq %rdx, %rcx
+; CHECK-NEXT:    sbbq %rdi, %rsi
+; CHECK-NEXT:    movq %r9, (%rax)
 ; CHECK-NEXT:    movq %r10, 8(%rax)
-; CHECK-NEXT:    movq %rdi, 16(%rax)
+; CHECK-NEXT:    movq %rcx, 16(%rax)
 ; CHECK-NEXT:    movq %rsi, 24(%rax)
 ; CHECK-NEXT:    retq
   %4 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 0, i32 0