diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2800,6 +2800,54 @@ return SDValue(); } +/** + * If we are facing some sort of diamond carry in/out pattern, try to + * break it up to generate: + * (addcarry A, B, In):Out + * + * Patterns typically look something like + * (uaddo A, B) + * / \ + * Carry Sum + * | \ + * | (uaddo *, In) + * | / + * \ Carry + * | / + * Out = (or *, *) + * + * Our goal is to identify A, B, and In and produce ADDCARRY/SUBCARRY with a + * single path for carry out propagation. + */ +static SDValue combineCarryDiamond(SelectionDAG &DAG, SDValue Carry0, + SDValue Carry1, SDNode *N) { + if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1) + return SDValue(); + unsigned Opcode = Carry0.getOpcode(); + if (Opcode != Carry1.getOpcode()) + return SDValue(); + if (Opcode != ISD::UADDO && Opcode != ISD::USUBO) + return SDValue(); + if (Carry0.getOperand(0) != Carry1.getValue(0)) { + if (Carry1.getOperand(0) != Carry0.getValue(0)) + return SDValue(); + // Canonicalize the carry in as Carry0 and the addition as Carry1. + std::swap(Carry0, Carry1); + } + if (!Carry1->hasNUsesOfValue(1, 0) || !Carry1->hasNUsesOfValue(1, 1)) + return SDValue(); + if (Carry0.getOperand(1).getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + if (Carry0.getOperand(1).getOperand(0).getValueType() != MVT::i1) + return SDValue(); + + SDValue Merged = + DAG.getNode(Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY, + SDLoc(N), Carry0->getVTList(), Carry1.getOperand(0), + Carry1.getOperand(1), Carry0.getOperand(1).getOperand(0)); + return Merged.getValue(1); +} + SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N) { // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry. 
@@ -5787,6 +5835,9 @@ if (SDValue Combined = visitORLike(N0, N1, N)) return Combined; + if (SDValue Combined = combineCarryDiamond(DAG, N0, N1, N)) + return Combined; + // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16) if (SDValue BSwap = MatchBSwapHWord(N, N0, N1)) return BSwap; @@ -7051,6 +7102,9 @@ if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); + if (SDValue Combined = combineCarryDiamond(DAG, N0, N1, N)) + return Combined; + return SDValue(); } diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -511,40 +511,13 @@ define i32 @add_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) { ; CHECK-LABEL: add_U320_without_i128_or: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbx, -16 -; CHECK-NEXT: addq 8(%rdi), %rdx -; CHECK-NEXT: setb %r10b ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq $0, %rdx -; CHECK-NEXT: setb %al -; CHECK-NEXT: addq 16(%rdi), %rcx -; CHECK-NEXT: setb %r11b -; CHECK-NEXT: orb %r10b, %al -; CHECK-NEXT: movzbl %al, %ebx -; CHECK-NEXT: addq %rcx, %rbx -; CHECK-NEXT: setb %cl -; CHECK-NEXT: addq 24(%rdi), %r8 -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: orb %r11b, %cl -; CHECK-NEXT: movzbl %cl, %esi -; CHECK-NEXT: addq %r8, %rsi +; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq %rcx, 16(%rdi) +; CHECK-NEXT: adcq %r8, 24(%rdi) +; CHECK-NEXT: adcq %r9, 32(%rdi) ; CHECK-NEXT: setb %al -; CHECK-NEXT: addq 32(%rdi), %r9 -; CHECK-NEXT: setb %r8b -; CHECK-NEXT: orb %r10b, %al ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: addq %r9, %rax -; CHECK-NEXT: setb %cl -; CHECK-NEXT: movq %rdx, 8(%rdi) -; CHECK-NEXT: movq %rbx, 16(%rdi) -; CHECK-NEXT: movq %rsi, 24(%rdi) -; CHECK-NEXT: movq %rax, 32(%rdi) -; CHECK-NEXT: orb %r8b, %cl -; CHECK-NEXT: movzbl %cl, %eax -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: 
.cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0 %8 = load i64, i64* %7, align 8 @@ -594,40 +567,13 @@ define i32 @add_U320_without_i128_xor(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) { ; CHECK-LABEL: add_U320_without_i128_xor: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbx, -16 -; CHECK-NEXT: addq 8(%rdi), %rdx -; CHECK-NEXT: setb %r10b ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq $0, %rdx -; CHECK-NEXT: setb %al -; CHECK-NEXT: addq 16(%rdi), %rcx -; CHECK-NEXT: setb %r11b -; CHECK-NEXT: xorb %r10b, %al -; CHECK-NEXT: movzbl %al, %ebx -; CHECK-NEXT: addq %rcx, %rbx -; CHECK-NEXT: setb %cl -; CHECK-NEXT: addq 24(%rdi), %r8 -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: xorb %r11b, %cl -; CHECK-NEXT: movzbl %cl, %esi -; CHECK-NEXT: addq %r8, %rsi +; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq %rcx, 16(%rdi) +; CHECK-NEXT: adcq %r8, 24(%rdi) +; CHECK-NEXT: adcq %r9, 32(%rdi) ; CHECK-NEXT: setb %al -; CHECK-NEXT: addq 32(%rdi), %r9 -; CHECK-NEXT: setb %r8b -; CHECK-NEXT: xorb %r10b, %al ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: addq %r9, %rax -; CHECK-NEXT: setb %cl -; CHECK-NEXT: movq %rdx, 8(%rdi) -; CHECK-NEXT: movq %rbx, 16(%rdi) -; CHECK-NEXT: movq %rsi, 24(%rdi) -; CHECK-NEXT: movq %rax, 32(%rdi) -; CHECK-NEXT: xorb %r8b, %cl -; CHECK-NEXT: movzbl %cl, %eax -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0 %8 = load i64, i64* %7, align 8 @@ -677,31 +623,11 @@ define void @add_U320_without_i128_or_no_ret(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) { ; CHECK-LABEL: add_U320_without_i128_or_no_ret: ; CHECK: # %bb.0: -; CHECK-NEXT: addq 8(%rdi), %rdx -; CHECK-NEXT: setb %r10b ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: 
adcq $0, %rdx -; CHECK-NEXT: setb %al -; CHECK-NEXT: addq 16(%rdi), %rcx -; CHECK-NEXT: setb %r11b -; CHECK-NEXT: orb %r10b, %al -; CHECK-NEXT: movzbl %al, %esi -; CHECK-NEXT: addq %rcx, %rsi -; CHECK-NEXT: setb %cl -; CHECK-NEXT: addq 24(%rdi), %r8 -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: orb %r11b, %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: addq %r8, %rcx -; CHECK-NEXT: setb %al -; CHECK-NEXT: addq 32(%rdi), %r9 -; CHECK-NEXT: orb %r10b, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: addq %r9, %rax -; CHECK-NEXT: movq %rdx, 8(%rdi) -; CHECK-NEXT: movq %rsi, 16(%rdi) -; CHECK-NEXT: movq %rcx, 24(%rdi) -; CHECK-NEXT: movq %rax, 32(%rdi) +; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq %rcx, 16(%rdi) +; CHECK-NEXT: adcq %r8, 24(%rdi) +; CHECK-NEXT: adcq %r9, 32(%rdi) ; CHECK-NEXT: retq %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0 %8 = load i64, i64* %7, align 8 @@ -747,34 +673,12 @@ define i32 @add_U320_uaddo(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) { ; CHECK-LABEL: add_U320_uaddo: ; CHECK: # %bb.0: -; CHECK-NEXT: addq 8(%rdi), %rdx -; CHECK-NEXT: setb %r10b ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq $0, %rdx -; CHECK-NEXT: setb %al -; CHECK-NEXT: orb %r10b, %al -; CHECK-NEXT: movzbl %al, %esi -; CHECK-NEXT: addq 16(%rdi), %rcx -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: addq %rsi, %rcx -; CHECK-NEXT: setb %al -; CHECK-NEXT: orb %r10b, %al -; CHECK-NEXT: movzbl %al, %esi -; CHECK-NEXT: addq 24(%rdi), %r8 -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: addq %rsi, %r8 -; CHECK-NEXT: setb %al -; CHECK-NEXT: orb %r10b, %al -; CHECK-NEXT: movzbl %al, %esi -; CHECK-NEXT: addq 32(%rdi), %r9 -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: addq %rsi, %r9 +; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq %rcx, 16(%rdi) +; CHECK-NEXT: adcq %r8, 24(%rdi) +; CHECK-NEXT: adcq %r9, 32(%rdi) ; CHECK-NEXT: setb %al -; CHECK-NEXT: orb %r10b, %al -; CHECK-NEXT: movq %rdx, 8(%rdi) -; 
CHECK-NEXT: movq %rcx, 16(%rdi) -; CHECK-NEXT: movq %r8, 24(%rdi) -; CHECK-NEXT: movq %r9, 32(%rdi) ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: retq %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0 @@ -838,22 +742,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rsi), %rcx -; CHECK-NEXT: movq (%rdx), %r8 -; CHECK-NEXT: leaq (%rcx,%r8), %rdi -; CHECK-NEXT: movq %rdi, (%rax) -; CHECK-NEXT: movq 8(%rsi), %rdi -; CHECK-NEXT: addq 8(%rdx), %rdi -; CHECK-NEXT: setb %r9b -; CHECK-NEXT: addq %r8, %rcx -; CHECK-NEXT: adcq $0, %rdi -; CHECK-NEXT: setb %cl -; CHECK-NEXT: orb %r9b, %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq 16(%rsi), %rsi -; CHECK-NEXT: addq 16(%rdx), %rsi -; CHECK-NEXT: addq %rcx, %rsi -; CHECK-NEXT: movq %rsi, 16(%rax) +; CHECK-NEXT: addq (%rdx), %rcx +; CHECK-NEXT: movq %rcx, (%rdi) +; CHECK-NEXT: movq 8(%rsi), %rcx +; CHECK-NEXT: adcq 8(%rdx), %rcx +; CHECK-NEXT: movq %rcx, 8(%rdi) +; CHECK-NEXT: movq 16(%rsi), %rcx +; CHECK-NEXT: adcq 16(%rdx), %rcx +; CHECK-NEXT: movq %rcx, 16(%rdi) ; CHECK-NEXT: retq %4 = getelementptr inbounds %struct.U192, %struct.U192* %1, i64 0, i32 0, i64 0 %5 = load i64, i64* %4, align 8 diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll --- a/llvm/test/CodeGen/X86/subcarry.ll +++ b/llvm/test/CodeGen/X86/subcarry.ll @@ -192,51 +192,13 @@ define i32 @sub_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) { ; CHECK-LABEL: sub_U320_without_i128_or: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movq 8(%rdi), %r14 -; CHECK-NEXT: movq 16(%rdi), %r10 -; CHECK-NEXT: movq 24(%rdi), %r11 -; CHECK-NEXT: movq 32(%rdi), %rbx -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: subq 
%rsi, (%rdi) +; CHECK-NEXT: sbbq %rdx, 8(%rdi) +; CHECK-NEXT: sbbq %rcx, 16(%rdi) +; CHECK-NEXT: sbbq %r8, 24(%rdi) +; CHECK-NEXT: sbbq %r9, 32(%rdi) ; CHECK-NEXT: setb %al -; CHECK-NEXT: subq %rdx, %r14 -; CHECK-NEXT: setb %dl -; CHECK-NEXT: subq %rax, %r14 -; CHECK-NEXT: setb %al -; CHECK-NEXT: subq %rcx, %r10 -; CHECK-NEXT: setb %cl -; CHECK-NEXT: orb %dl, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: subq %rax, %r10 -; CHECK-NEXT: setb %al -; CHECK-NEXT: subq %r8, %r11 -; CHECK-NEXT: setb %dl -; CHECK-NEXT: orb %cl, %al ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: subq %rax, %r11 -; CHECK-NEXT: setb %al -; CHECK-NEXT: subq %r9, %rbx -; CHECK-NEXT: setb %cl -; CHECK-NEXT: orb %dl, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: subq %rax, %rbx -; CHECK-NEXT: setb %al -; CHECK-NEXT: movq %r14, 8(%rdi) -; CHECK-NEXT: movq %r10, 16(%rdi) -; CHECK-NEXT: movq %r11, 24(%rdi) -; CHECK-NEXT: movq %rbx, 32(%rdi) -; CHECK-NEXT: orb %cl, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0 %8 = load i64, i64* %7, align 8 @@ -286,51 +248,13 @@ define i32 @sub_U320_usubo(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) { ; CHECK-LABEL: sub_U320_usubo: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movq 8(%rdi), %r14 -; CHECK-NEXT: movq 16(%rdi), %r10 -; CHECK-NEXT: movq 24(%rdi), %r11 -; CHECK-NEXT: movq 32(%rdi), %rbx -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: subq %rsi, (%rdi) +; CHECK-NEXT: sbbq %rdx, 8(%rdi) +; CHECK-NEXT: sbbq %rcx, 16(%rdi) +; CHECK-NEXT: sbbq %r8, 24(%rdi) +; CHECK-NEXT: sbbq %r9, 32(%rdi) ; CHECK-NEXT: setb %al -; 
CHECK-NEXT: subq %rdx, %r14 -; CHECK-NEXT: setb %dl -; CHECK-NEXT: subq %rax, %r14 -; CHECK-NEXT: setb %al -; CHECK-NEXT: orb %dl, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: subq %rcx, %r10 -; CHECK-NEXT: setb %cl -; CHECK-NEXT: subq %rax, %r10 -; CHECK-NEXT: setb %al -; CHECK-NEXT: orb %cl, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: subq %r8, %r11 -; CHECK-NEXT: setb %cl -; CHECK-NEXT: subq %rax, %r11 -; CHECK-NEXT: setb %al -; CHECK-NEXT: orb %cl, %al ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: subq %r9, %rbx -; CHECK-NEXT: setb %cl -; CHECK-NEXT: subq %rax, %rbx -; CHECK-NEXT: setb %al -; CHECK-NEXT: orb %cl, %al -; CHECK-NEXT: movq %r14, 8(%rdi) -; CHECK-NEXT: movq %r10, 16(%rdi) -; CHECK-NEXT: movq %r11, 24(%rdi) -; CHECK-NEXT: movq %rbx, 32(%rdi) -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0 %8 = load i64, i64* %7, align 8 @@ -393,22 +317,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rsi), %rcx -; CHECK-NEXT: xorl %r9d, %r9d ; CHECK-NEXT: subq (%rdx), %rcx -; CHECK-NEXT: setb %r9b ; CHECK-NEXT: movq %rcx, (%rdi) -; CHECK-NEXT: movq 8(%rsi), %rdi -; CHECK-NEXT: subq 8(%rdx), %rdi -; CHECK-NEXT: setb %r8b -; CHECK-NEXT: subq %r9, %rdi -; CHECK-NEXT: setb %cl -; CHECK-NEXT: orb %r8b, %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq 16(%rsi), %rsi -; CHECK-NEXT: subq 16(%rdx), %rsi -; CHECK-NEXT: subq %rcx, %rsi -; CHECK-NEXT: movq %rsi, 16(%rax) +; CHECK-NEXT: movq 8(%rsi), %rcx +; CHECK-NEXT: sbbq 8(%rdx), %rcx +; CHECK-NEXT: movq %rcx, 8(%rdi) +; CHECK-NEXT: movq 16(%rsi), %rcx +; CHECK-NEXT: sbbq 16(%rdx), %rcx +; CHECK-NEXT: movq %rcx, 16(%rdi) ; CHECK-NEXT: retq %4 = getelementptr inbounds %struct.U192, %struct.U192* %1, i64 0, i32 0, i64 0 %5 = load i64, i64* %4, 
align 8