diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2800,6 +2800,90 @@
   return SDValue();
 }
 
+// If we are facing some sort of diamond carry/borrow in/out pattern, try to
+// break it up to generate: (addcarry A, B, In):Out
+//
+// Patterns typically look something like:
+//
+//             (uaddo A, B)
+//              /       \
+//           Carry      Sum
+//             |          \
+//             |    (uaddo *, In)
+//             |        /
+//              \    Carry
+//               |    /
+//            Out = (or *, *)
+//
+// Our goal is to identify A, B, and In and produce ADDCARRY/SUBCARRY with a
+// single path for carry/borrow out propagation.
+//
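+// After the combine, the diamond above collapses to a single node that
+// produces both results (illustrative; the merged node is built below from
+// Carry1's operands and the zero-extended carry-in):
+//
+//             (addcarry A, B, In)
+//              /               \
+//            Sum               Out
+//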
+// Because we have proven that the result of the UADDO/USUBO of A and B feeds
+// the UADDO/USUBO that consumes the carry/borrow in, at most one of the two
+// operations can overflow: either the first UADDO/USUBO overflows or the
+// second one does, but never both. For example, consider 8-bit numbers where
+// 0xFF is the maximum value:
+//
+//   0xFF + 0xFF == 0xFE with carry, but 0xFE + 1 does not carry
+//   0xF0 + 0x0F == 0xFF does not carry, but 0xFF + 1 does carry
+//   0x00 - 0xFF == 0x01 with a borrow, but 0x01 - 1 == 0x00 does not borrow
+//   0xFF - 0xFF == 0x00 does not borrow, but 0x00 - 1 == 0xFF does borrow
+//
+// This is important because it means that OR and XOR can be used to merge
+// the two carry flags, and that AND of the two flags always yields zero.
+//
+// TODO: match other operations that can merge flags (ADD, etc)
+static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI,
+                                   SDValue Carry0, SDValue Carry1, SDNode *N) {
+  if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1)
+    return SDValue();
+  unsigned Opcode = Carry0.getOpcode();
+  if (Opcode != Carry1.getOpcode())
+    return SDValue();
+  if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
+    return SDValue();
+
+  // Canonicalize the carry in as Carry0 and the add/sub of A and B as Carry1,
+  // i.e. the middle and top nodes respectively in the ASCII art above.
+  if (Carry0.getOperand(0) != Carry1.getValue(0))
+    std::swap(Carry0, Carry1);
+  if (Carry0.getOperand(0) != Carry1.getValue(0))
+    return SDValue();
+
+  unsigned ISDOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
+  if (!TLI.isOperationLegalOrCustom(ISDOp, Carry1.getValue(0).getValueType()))
+    return SDValue();
+
+  // Verify that the add/sub has only one use of each of its results.
+  if (!Carry1->hasNUsesOfValue(1, 0) || !Carry1->hasNUsesOfValue(1, 1))
+    return SDValue();
+
+  // Verify that the carry/borrow in is plausibly a carry/borrow bit.
+  if (Carry0.getOperand(1).getOpcode() != ISD::ZERO_EXTEND)
+    return SDValue();
+  if (Carry0.getOperand(1).getOperand(0).getValueType() != MVT::i1)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue Merged = DAG.getNode(ISDOp, DL, Carry0->getVTList(),
+                               Carry1.getOperand(0), Carry1.getOperand(1),
+                               Carry0.getOperand(1).getOperand(0));
+  if (N->getOpcode() != ISD::AND)
+    return Merged.getValue(1);
+  // For AND the merged carry out is known to be zero (see the proof above),
+  // but the users of the carry-consuming UADDO/USUBO still need its sum.
+  DAG.ReplaceAllUsesOfValueWith(Carry0.getValue(0), Merged.getValue(0));
+  return DAG.getConstant(0, DL, MVT::i1);
+}
+
 SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
                                        SDNode *N) {
   // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
@@ -5092,6 +5176,9 @@
   if (SDValue Shuffle = XformToShuffleWithZero(N))
     return Shuffle;
 
+  if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
+    return Combined;
+
   // fold (and (or x, C), D) -> D if (C & D) == D
   auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
     return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
   };
@@ -5787,6 +5874,9 @@
   if (SDValue Combined = visitORLike(N0, N1, N))
     return Combined;
 
+  if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
+    return Combined;
+
   // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
   if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
     return BSwap;
@@ -7051,6 +7141,9 @@
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
+  if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
+    return Combined;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll
--- a/llvm/test/CodeGen/X86/addcarry.ll
+++ b/llvm/test/CodeGen/X86/addcarry.ll
@@ -511,40 +511,13 @@
 define i32 @add_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: add_U320_without_i128_or:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: addq 8(%rdi), %rdx
-; CHECK-NEXT: setb %r10b
 ; CHECK-NEXT: addq %rsi, (%rdi)
-; CHECK-NEXT: adcq $0, %rdx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 16(%rdi), %rcx
-; CHECK-NEXT: setb %r11b
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %ebx
-; CHECK-NEXT: addq %rcx, %rbx
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: addq 24(%rdi), %r8
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: orb %r11b, %cl
-; CHECK-NEXT: movzbl %cl, %esi
-; CHECK-NEXT: addq %r8, %rsi
+; CHECK-NEXT: adcq %rdx, 8(%rdi)
+; CHECK-NEXT: adcq %rcx, 16(%rdi)
+; CHECK-NEXT: adcq %r8, 24(%rdi)
+; CHECK-NEXT: adcq %r9, 32(%rdi)
 ; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 32(%rdi), %r9
-; CHECK-NEXT: setb %r8b
-; CHECK-NEXT: orb %r10b, %al
 ; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: addq %r9, %rax
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: movq %rdx, 8(%rdi)
-; CHECK-NEXT: movq %rbx, 16(%rdi)
-; CHECK-NEXT: movq %rsi, 24(%rdi)
-; CHECK-NEXT: movq %rax, 32(%rdi)
-; CHECK-NEXT: orb %r8b, %cl
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
   %8 = load i64, i64* %7, align 8
@@ -591,43 +564,74 @@
   ret i32 %43
 }
 
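+; The combine should also fire when the low limb is incremented by a
+; constant: the chain becomes addq $1 followed by adcq $0 on the upper limbs.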
+define i32 @add_constant1_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0) {
+; CHECK-LABEL: add_constant1_U320_without_i128_or:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addq $1, (%rdi)
+; CHECK-NEXT: adcq $0, 8(%rdi)
+; CHECK-NEXT: adcq $0, 16(%rdi)
+; CHECK-NEXT: adcq $0, 24(%rdi)
+; CHECK-NEXT: adcq $0, 32(%rdi)
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: retq
+  %2 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
+  %3 = load i64, i64* %2, align 8
+  %4 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 1
+  %5 = load i64, i64* %4, align 8
+  %6 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 2
+  %7 = load i64, i64* %6, align 8
+  %8 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 3
+  %9 = load i64, i64* %8, align 8
+  %10 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 4
+  %11 = load i64, i64* %10, align 8
+  %12 = add i64 %3, 1
+  %13 = add i64 %5, 0
+  %14 = icmp ult i64 %12, 1
+  %15 = zext i1 %14 to i64
+  %16 = add i64 %13, %15
+  %17 = add i64 %7, 0
+  %18 = icmp ult i64 %13, %5
+  %19 = icmp ult i64 %16, %13
+  %20 = or i1 %18, %19
+  %21 = zext i1 %20 to i64
+  %22 = add i64 %17, %21
+  %23 = add i64 %9, 0
+  %24 = icmp ult i64 %17, %7
+  %25 = icmp ult i64 %22, %17
+  %26 = or i1 %24, %25
+  %27 = zext i1 %26 to i64
+  %28 = add i64 %23, %27
+  %29 = add i64 %11, 0
+  %30 = icmp ult i64 %23, %9
+  %31 = icmp ult i64 %28, %23
+  %32 = or i1 %30, %31
+  %33 = zext i1 %32 to i64
+  %34 = add i64 %29, %33
+  store i64 %12, i64* %2, align 8
+  store i64 %16, i64* %4, align 8
+  store i64 %22, i64* %6, align 8
+  store i64 %28, i64* %8, align 8
+  store i64 %34, i64* %10, align 8
+  %35 = icmp ult i64 %29, %11
+  %36 = icmp ult i64 %34, %29
+  %37 = or i1 %35, %36
+  %38 = zext i1 %37 to i32
+  ret i32 %38
+}
+
 define i32 @add_U320_without_i128_xor(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: add_U320_without_i128_xor:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: addq 8(%rdi), %rdx
-; CHECK-NEXT: setb %r10b
 ; CHECK-NEXT: addq %rsi, (%rdi)
-; CHECK-NEXT: adcq $0, %rdx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 16(%rdi), %rcx
-; CHECK-NEXT: setb %r11b
-; CHECK-NEXT: xorb %r10b, %al
-; CHECK-NEXT: movzbl %al, %ebx
-; CHECK-NEXT: addq %rcx, %rbx
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: addq 24(%rdi), %r8
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: xorb %r11b, %cl
-; CHECK-NEXT: movzbl %cl, %esi
-; CHECK-NEXT: addq %r8, %rsi
+; CHECK-NEXT: adcq %rdx, 8(%rdi)
+; CHECK-NEXT: adcq %rcx, 16(%rdi)
+; CHECK-NEXT: adcq %r8, 24(%rdi)
+; CHECK-NEXT: adcq %r9, 32(%rdi)
 ; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 32(%rdi), %r9
-; CHECK-NEXT: setb %r8b
-; CHECK-NEXT: xorb %r10b, %al
 ; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: addq %r9, %rax
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: movq %rdx, 8(%rdi)
-; CHECK-NEXT: movq %rbx, 16(%rdi)
-; CHECK-NEXT: movq %rsi, 24(%rdi)
-; CHECK-NEXT: movq %rax, 32(%rdi)
-; CHECK-NEXT: xorb %r8b, %cl
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
   %8 = load i64, i64* %7, align 8
@@ -674,34 +678,75 @@
   ret i32 %43
 }
 
+; Either the primary addition can overflow or the addition of the carry, but
+; they cannot both overflow.
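+; The and of the two carry bits therefore folds to the constant zero: only
+; the first carry survives (a single adcq) and the final carry-out is 0.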
+define i32 @bogus_add_U320_without_i128_and(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
+; CHECK-LABEL: bogus_add_U320_without_i128_and:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addq %rsi, (%rdi)
+; CHECK-NEXT: adcq %rdx, 8(%rdi)
+; CHECK-NEXT: addq %rcx, 16(%rdi)
+; CHECK-NEXT: addq %r8, 24(%rdi)
+; CHECK-NEXT: addq %r9, 32(%rdi)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+  %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
+  %8 = load i64, i64* %7, align 8
+  %9 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 1
+  %10 = load i64, i64* %9, align 8
+  %11 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 2
+  %12 = load i64, i64* %11, align 8
+  %13 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 3
+  %14 = load i64, i64* %13, align 8
+  %15 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 4
+  %16 = load i64, i64* %15, align 8
+  %17 = add i64 %8, %1
+  %18 = add i64 %10, %2
+  %19 = icmp ult i64 %17, %1
+  %20 = zext i1 %19 to i64
+  %21 = add i64 %18, %20
+  %22 = add i64 %12, %3
+  %23 = icmp ult i64 %18, %10
+  %24 = icmp ult i64 %21, %18
+  %25 = and i1 %23, %24
+  %26 = zext i1 %25 to i64
+  %27 = add i64 %22, %26
+  %28 = add i64 %14, %4
+  %29 = icmp ult i64 %22, %12
+  %30 = icmp ult i64 %27, %22
+  %31 = and i1 %29, %30
+  %32 = zext i1 %31 to i64
+  %33 = add i64 %28, %32
+  %34 = add i64 %16, %5
+  %35 = icmp ult i64 %28, %14
+  %36 = icmp ult i64 %33, %28
+  %37 = and i1 %35, %36
+  %38 = zext i1 %37 to i64
+  %39 = add i64 %34, %38
+  store i64 %17, i64* %7, align 8
+  store i64 %21, i64* %9, align 8
+  store i64 %27, i64* %11, align 8
+  store i64 %33, i64* %13, align 8
+  store i64 %39, i64* %15, align 8
+  %40 = icmp ult i64 %34, %16
+  %41 = icmp ult i64 %39, %34
+  %42 = and i1 %40, %41
+  %43 = zext i1 %42 to i32
+  ret i32 %43
+}
+
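+; The final carry-out is unused here; the combine should still turn the
+; whole chain into addq followed by four memory adcqs.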
 define void @add_U320_without_i128_or_no_ret(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: add_U320_without_i128_or_no_ret:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: addq 8(%rdi), %rdx
-; CHECK-NEXT: setb %r10b
 ; CHECK-NEXT: addq %rsi, (%rdi)
-; CHECK-NEXT: adcq $0, %rdx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 16(%rdi), %rcx
-; CHECK-NEXT: setb %r11b
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %esi
-; CHECK-NEXT: addq %rcx, %rsi
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: addq 24(%rdi), %r8
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: orb %r11b, %cl
-; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: addq %r8, %rcx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 32(%rdi), %r9
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: addq %r9, %rax
-; CHECK-NEXT: movq %rdx, 8(%rdi)
-; CHECK-NEXT: movq %rsi, 16(%rdi)
-; CHECK-NEXT: movq %rcx, 24(%rdi)
-; CHECK-NEXT: movq %rax, 32(%rdi)
+; CHECK-NEXT: adcq %rdx, 8(%rdi)
+; CHECK-NEXT: adcq %rcx, 16(%rdi)
+; CHECK-NEXT: adcq %r8, 24(%rdi)
+; CHECK-NEXT: adcq %r9, 32(%rdi)
 ; CHECK-NEXT: retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
   %8 = load i64, i64* %7, align 8
@@ -747,34 +792,12 @@
 define i32 @add_U320_uaddo(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: add_U320_uaddo:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: addq 8(%rdi), %rdx
-; CHECK-NEXT: setb %r10b
 ; CHECK-NEXT: addq %rsi, (%rdi)
-; CHECK-NEXT: adcq $0, %rdx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %esi
-; CHECK-NEXT: addq 16(%rdi), %rcx
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: addq %rsi, %rcx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %esi
-; CHECK-NEXT: addq 24(%rdi), %r8
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: addq %rsi, %r8
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %esi
-; CHECK-NEXT: addq 32(%rdi), %r9
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: addq %rsi, %r9
+; CHECK-NEXT: adcq %rdx, 8(%rdi)
+; CHECK-NEXT: adcq %rcx, 16(%rdi)
+; CHECK-NEXT: adcq %r8, 24(%rdi)
+; CHECK-NEXT: adcq %r9, 32(%rdi)
 ; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movq %rdx, 8(%rdi)
-; CHECK-NEXT: movq %rcx, 16(%rdi)
-; CHECK-NEXT: movq %r8, 24(%rdi)
-; CHECK-NEXT: movq %r9, 32(%rdi)
 ; CHECK-NEXT: movzbl %al, %eax
 ; CHECK-NEXT: retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
@@ -838,22 +861,14 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movq %rdi, %rax
 ; CHECK-NEXT: movq (%rsi), %rcx
-; CHECK-NEXT: movq (%rdx), %r8
-; CHECK-NEXT: leaq (%rcx,%r8), %rdi
-; CHECK-NEXT: movq %rdi, (%rax)
-; CHECK-NEXT: movq 8(%rsi), %rdi
-; CHECK-NEXT: addq 8(%rdx), %rdi
-; CHECK-NEXT: setb %r9b
-; CHECK-NEXT: addq %r8, %rcx
-; CHECK-NEXT: adcq $0, %rdi
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %r9b, %cl
-; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: movq %rdi, 8(%rax)
-; CHECK-NEXT: movq 16(%rsi), %rsi
-; CHECK-NEXT: addq 16(%rdx), %rsi
-; CHECK-NEXT: addq %rcx, %rsi
-; CHECK-NEXT: movq %rsi, 16(%rax)
+; CHECK-NEXT: addq (%rdx), %rcx
+; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: movq 8(%rsi), %rcx
+; CHECK-NEXT: adcq 8(%rdx), %rcx
+; CHECK-NEXT: movq %rcx, 8(%rdi)
+; CHECK-NEXT: movq 16(%rsi), %rcx
+; CHECK-NEXT: adcq 16(%rdx), %rcx
+; CHECK-NEXT: movq %rcx, 16(%rdi)
 ; CHECK-NEXT: retq
   %4 = getelementptr inbounds %struct.U192, %struct.U192* %1, i64 0, i32 0, i64 0
   %5 = load i64, i64* %4, align 8
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -192,51 +192,15 @@
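+; The same diamond combine applies to borrows: usubo feeding usubo with the
+; borrow bits merged through or becomes a chain of sbbq.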
 define i32 @sub_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: sub_U320_without_i128_or:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: .cfi_offset %r14, -16
-; CHECK-NEXT: movq 8(%rdi), %r14
-; CHECK-NEXT: movq 16(%rdi), %r10
-; CHECK-NEXT: movq 24(%rdi), %r11
-; CHECK-NEXT: movq 32(%rdi), %rbx
-; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: subq %rsi, (%rdi)
+; CHECK-NEXT: sbbq %rdx, 8(%rdi)
+; CHECK-NEXT: sbbq %rcx, 16(%rdi)
+; CHECK-NEXT: sbbq %r8, 24(%rdi)
+; CHECK-NEXT: sbbq %r9, 32(%rdi)
 ; CHECK-NEXT: setb %al
-; CHECK-NEXT: subq %rdx, %r14
-; CHECK-NEXT: setb %dl
-; CHECK-NEXT: subq %rax, %r14
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: subq %rcx, %r10
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %dl, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %rax, %r10
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: subq %r8, %r11
-; CHECK-NEXT: setb %dl
-; CHECK-NEXT: orb %cl, %al
 ; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %rax, %r11
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: subq %r9, %rbx
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %dl, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %rax, %rbx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: movq %r14, 8(%rdi)
-; CHECK-NEXT: movq %r10, 16(%rdi)
-; CHECK-NEXT: movq %r11, 24(%rdi)
-; CHECK-NEXT: movq %rbx, 32(%rdi)
-; CHECK-NEXT: orb %cl, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
   %8 = load i64, i64* %7, align 8
@@ -286,51 +250,13 @@
 define i32 @sub_U320_usubo(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
 ; CHECK-LABEL: sub_U320_usubo:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: .cfi_offset %r14, -16
-; CHECK-NEXT: movq 8(%rdi), %r14
-; CHECK-NEXT: movq 16(%rdi), %r10
-; CHECK-NEXT: movq 24(%rdi), %r11
-; CHECK-NEXT: movq 32(%rdi), %rbx
-; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: subq %rsi, (%rdi)
+; CHECK-NEXT: sbbq %rdx, 8(%rdi)
+; CHECK-NEXT: sbbq %rcx, 16(%rdi)
+; CHECK-NEXT: sbbq %r8, 24(%rdi)
+; CHECK-NEXT: sbbq %r9, 32(%rdi)
 ; CHECK-NEXT: setb %al
-; CHECK-NEXT: subq %rdx, %r14
-; CHECK-NEXT: setb %dl
-; CHECK-NEXT: subq %rax, %r14
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %dl, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %rcx, %r10
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: subq %rax, %r10
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %cl, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %r8, %r11
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: subq %rax, %r11
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %cl, %al
 ; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %r9, %rbx
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: subq %rax, %rbx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %cl, %al
-; CHECK-NEXT: movq %r14, 8(%rdi)
-; CHECK-NEXT: movq %r10, 16(%rdi)
-; CHECK-NEXT: movq %r11, 24(%rdi)
-; CHECK-NEXT: movq %rbx, 32(%rdi)
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: retq
   %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
   %8 = load i64, i64* %7, align 8
@@ -393,22 +319,14 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movq %rdi, %rax
 ; CHECK-NEXT: movq (%rsi), %rcx
-; CHECK-NEXT: xorl %r9d, %r9d
 ; CHECK-NEXT: subq (%rdx), %rcx
-; CHECK-NEXT: setb %r9b
 ; CHECK-NEXT: movq %rcx, (%rdi)
-; CHECK-NEXT: movq 8(%rsi), %rdi
-; CHECK-NEXT: subq 8(%rdx), %rdi
-; CHECK-NEXT: setb %r8b
-; CHECK-NEXT: subq %r9, %rdi
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %r8b, %cl
-; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: movq %rdi, 8(%rax)
-; CHECK-NEXT: movq 16(%rsi), %rsi
-; CHECK-NEXT: subq 16(%rdx), %rsi
-; CHECK-NEXT: subq %rcx, %rsi
-; CHECK-NEXT: movq %rsi, 16(%rax)
+; CHECK-NEXT: movq 8(%rsi), %rcx
+; CHECK-NEXT: sbbq 8(%rdx), %rcx
+; CHECK-NEXT: movq %rcx, 8(%rdi)
+; CHECK-NEXT: movq 16(%rsi), %rcx
+; CHECK-NEXT: sbbq 16(%rdx), %rcx
+; CHECK-NEXT: movq %rcx, 16(%rdi)
 ; CHECK-NEXT: retq
   %4 = getelementptr inbounds %struct.U192, %struct.U192* %1, i64 0, i32 0, i64 0
   %5 = load i64, i64* %4, align 8