diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -365,6 +365,8 @@
     bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val);
     bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val);
 
+    SDValue combineSRXToOverflow(SDNode *N);
+
     /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
     ///   load.
     ///
@@ -8738,6 +8740,9 @@
     if (SDValue NewSRA = visitShiftByConstant(N))
       return NewSRA;
 
+  if (SDValue Overflow = combineSRXToOverflow(N))
+    return Overflow;
+
   // Try to transform this shift into a multiply-high if
   // it matches the appropriate pattern detected in combineShiftToMULH.
   if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
@@ -8746,6 +8751,89 @@
   return SDValue();
 }
 
+// Folds (srX (add (zext a), (zext b)), C) -> (zext (uaddo a, b).overflow),
+// where srX is srl or sra, `a` and `b` are C bits wide, C is a power of two
+// and the width of the add divided by two equals C. Bit C of the wide sum is
+// the carry out of the C-bit addition and all bits above it are zero, so the
+// shifted value is exactly the unsigned-overflow flag. Truncates of the wide
+// add back to C bits are rewired to the sum produced by the new uaddo.
+//
+// Returns the replacement value for `N`, or an empty SDValue when the pattern
+// does not match.
+SDValue DAGCombiner::combineSRXToOverflow(SDNode *N) {
+  assert(N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (Level >= AfterLegalizeVectorOps || N0.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The uaddo built below produces an i1 overflow value; once types have been
+  // legalized that is only allowed if i1 is itself a legal type.
+  if (Level != BeforeLegalizeTypes && !TLI.isTypeLegal(MVT::i1))
+    return SDValue();
+
+  auto *N1C = dyn_cast<ConstantSDNode>(N1);
+  if (!N1C || !N1C->getAPIntValue().isPowerOf2())
+    return SDValue();
+
+  unsigned ShiftAmt = N1C->getZExtValue();
+
+  // For a two-bit add of two one-bit values the carry lands in the sign bit,
+  // so an arithmetic shift would smear it instead of producing 0 or 1.
+  if (N->getOpcode() == ISD::SRA && ShiftAmt < 2)
+    return SDValue();
+
+  SDValue N0LHS = N0.getOperand(0);
+  SDValue N0RHS = N0.getOperand(1);
+
+  // Make sure that `N0`'s operands were zero-extended from an int type of
+  // `ShiftAmt` width.
+  if (N0LHS.getOpcode() != ISD::ZERO_EXTEND ||
+      N0RHS.getOpcode() != ISD::ZERO_EXTEND)
+    return SDValue();
+
+  SDValue N0NoZextLHS = N0LHS.getOperand(0);
+  SDValue N0NoZextRHS = N0RHS.getOperand(0);
+  EVT NarrowVT = N0NoZextLHS.getValueType();
+
+  if (NarrowVT != N0NoZextRHS.getValueType() ||
+      NarrowVT.getScalarSizeInBits() != ShiftAmt)
+    return SDValue();
+
+  if ((N0.getValueType().getScalarSizeInBits() / 2) != ShiftAmt)
+    return SDValue();
+
+  SmallVector<SDNode *, 2> Truncs;
+  // Make sure that `N0` is only used by `N` and `ShiftAmt`-truncates.
+  for (SDNode *U : N0->uses()) {
+    if (U == N)
+      continue;
+
+    if (U->getOpcode() != ISD::TRUNCATE ||
+        U->getValueType(0).getScalarSizeInBits() != ShiftAmt)
+      return SDValue();
+
+    Truncs.push_back(U);
+  }
+
+  SDValue UAddO =
+      DAG.getNode(ISD::UADDO, SDLoc(N0), DAG.getVTList(NarrowVT, MVT::i1),
+                  N0NoZextLHS, N0NoZextRHS);
+
+  // Replace the `N0` truncate uses with `UAddO` since `UAddO` performs the
+  // truncated version of the addition performed by `N0`.
+  for (SDNode *Trunc : Truncs)
+    DAG.ReplaceAllUsesWith(SDValue(Trunc, 0), UAddO.getValue(0));
+
+  // `N0` itself is deliberately left alone: its result is twice as wide as
+  // the uaddo sum, so it cannot be replaced by it. Its remaining users are
+  // `N`, which the caller replaces with the value returned here, and the
+  // now-dead truncates, so it simply becomes dead.
+  return DAG.getZExtOrTrunc(UAddO.getValue(1), SDLoc(N), N->getValueType(0));
+}
+
 SDValue DAGCombiner::visitSRL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8968,6 +9056,9 @@
     }
   }
 
+  if (SDValue Overflow = combineSRXToOverflow(N))
+    return Overflow;
+
   // Try to transform this shift into a multiply-high if
   // it matches the appropriate pattern detected in combineShiftToMULH.
   if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
diff --git a/llvm/test/CodeGen/AMDGPU/combine-srx-add.ll b/llvm/test/CodeGen/AMDGPU/combine-srx-add.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-srx-add.ll
@@ -0,0 +1,109 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}basicLshr:
+; CHECK: v_add_co_u32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+define i64 @basicLshr(i32 %a, i32 %b, i64 %c) {
+entry:
+  %a.zext = zext i32 %a to i64
+  %b.zext = zext i32 %b to i64
+  %add.a.b = add i64 %a.zext, %b.zext
+  %shr = lshr i64 %add.a.b, 32 ; logical shift by the 32-bit source width
+  %add.c.shr = add i64 %c, %shr
+  ret i64 %add.c.shr
+}
+
+; CHECK-LABEL: {{^}}basicAshr:
+; CHECK: v_add_co_u32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+define i64 @basicAshr(i32 %a, i32 %b, i64 %c) {
+entry:
+  %a.zext = zext i32 %a to i64
+  %b.zext = zext i32 %b to i64
+  %add.a.b = add i64 %a.zext, %b.zext
+  %shr = ashr i64 %add.a.b, 32 ; arithmetic-shift variant of basicLshr
+  %add.c.shr = add i64 %c, %shr
+  ret i64 %add.c.shr
+}
+
+; CHECK-LABEL: {{^}}truncUse:
+; CHECK: v_add_co_u32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v0, vcc
+define i32 @truncUse(i32 %a, i32 %b) {
+entry:
+  %a.zext = zext i32 %a to i64
+  %b.zext = zext i32 %b to i64
+  %add.a.b = add i64 %a.zext, %b.zext
+  %add.a.b.trunc = trunc i64 %add.a.b to i32 ; second use of the wide add: a truncate back to i32
+  %shr = lshr i64 %add.a.b, 32
+  %shr.trunc = trunc i64 %shr to i32
+  %ret = add i32 %add.a.b.trunc, %shr.trunc
+  ret i32 %ret
+}
+
+; CHECK-LABEL: {{^}}sext:
+; CHECK: v_ashrrev_i32_e32 [[ashrV0:v[0-9]+]], 31, v0
+; CHECK-NEXT: v_ashrrev_i32_e32 [[ashrV1:v[0-9]+]], 31, v1
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_addc_co_u32_e32 v0, vcc, [[ashrV0]], [[ashrV1]], vcc
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+define i64 @sext(i32 %a, i32 %b, i64 %c) {
+entry:
+  %a.zext = sext i32 %a to i64 ; sign-extended despite the ".zext" name
+  %b.zext = sext i32 %b to i64 ; sign-extended despite the ".zext" name
+  %add.a.b = add i64 %a.zext, %b.zext
+  %shr = lshr i64 %add.a.b, 32
+  %add.c.shr = add i64 %c, %shr
+  ret i64 %add.c.shr
+}
+
+; CHECK-LABEL: {{^}}sextAndZext:
+; CHECK: v_ashrrev_i32_e32 [[ashrV0:v[0-9]+]], 31, v0
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_addc_co_u32_e32 v0, vcc, 0, [[ashrV0]], vcc
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+define i64 @sextAndZext(i32 %a, i32 %b, i64 %c) {
+entry:
+  %a.zext = sext i32 %a to i64 ; sign-extended despite the ".zext" name
+  %b.zext = zext i32 %b to i64
+  %add.a.b = add i64 %a.zext, %b.zext
+  %shr = lshr i64 %add.a.b, 32
+  %add.c.shr = add i64 %c, %shr
+  ret i64 %add.c.shr
+}
+
+; CHECK-LABEL: {{^}}lshr31:
+; CHECK: v_lshrrev_b64 v[0:1], 31, v[0:1]
+define i64 @lshr31(i32 %a, i32 %b, i64 %c) {
+entry:
+  %a.zext = zext i32 %a to i64
+  %b.zext = zext i32 %b to i64
+  %add.a.b = add i64 %a.zext, %b.zext
+  %shr = lshr i64 %add.a.b, 31 ; shift amount differs from the 32-bit source width
+  %add.c.shr = add i64 %c, %shr
+  ret i64 %add.c.shr
+}
+
+; CHECK-LABEL: {{^}}badUse:
+; CHECK: v_add_co_u32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_addc_co_u32_e64 v1, {{.+}}, 0, 0, vcc
+; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, v2, v1
+; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+define i64 @badUse(i32 %a, i32 %b, i64 %c) {
+entry:
+  %a.zext = zext i32 %a to i64
+  %b.zext = zext i32 %b to i64
+  %add.a.b = add i64 %a.zext, %b.zext
+  %shr = lshr i64 %add.a.b, 32
+  %add.c.shr = add i64 %c, %shr
+  %ret = add i64 %add.c.shr, %add.a.b ; full-width use of the wide add besides the shift
+  ret i64 %ret
+}
diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll
--- a/llvm/test/CodeGen/X86/addcarry.ll
+++ b/llvm/test/CodeGen/X86/addcarry.ll
@@ -155,9 +155,11 @@
 define void @d(i8* nocapture %r, i64 %a, i64 %b, i8 %c) nounwind {
 ; CHECK-LABEL: d:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq %rdx, %rsi
-; CHECK-NEXT:    adcb $0, %cl
-; CHECK-NEXT:    movb %cl, (%rdi)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    addq %rdx, %rsi
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    movb %al, (%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = zext i64 %a to i128
@@ -195,6 +197,40 @@
   ret i8 %8
 }
 
+define i64 @f(i32 %a, i32 %b, i64 %c) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    adcq $0, %rax
+; CHECK-NEXT:    retq
+entry:
+  %a.zext = zext i32 %a to i64
+  %b.zext = zext i32 %b to i64
+  %add.a.b = add i64 %a.zext, %b.zext
+  %shr = lshr i64 %add.a.b, 32 ; logical shift by the 32-bit source width
+  %add.c.shr = add i64 %c, %shr
+  ret i64 %add.c.shr
+}
+
+define i32 @g(i32 %a, i32 %b) {
+; CHECK-LABEL: g:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    addl %esi, %eax
+; CHECK-NEXT:    adcl $0, %eax
+; CHECK-NEXT:    retq
+entry:
+  %a.zext = zext i32 %a to i64
+  %b.zext = zext i32 %b to i64
+  %add.a.b = add i64 %a.zext, %b.zext
+  %add.a.b.trunc = trunc i64 %add.a.b to i32 ; second use of the wide add: a truncate back to i32
+  %shr = lshr i64 %add.a.b, 32
+  %shr.trunc = trunc i64 %shr to i32
+  %ret = add i32 %add.a.b.trunc, %shr.trunc
+  ret i32 %ret
+}
+
 %scalar = type { [4 x i64] }
 
 define %scalar @pr31719(%scalar* nocapture readonly %this, %scalar %arg.b) {