diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2671,12 +2671,22 @@
   // equivalent to (add x, c).
   // Reassociate (add (xor x, c), y) -> (add add(x, y), c)) if (xor x, c) is
   // equivalent to (add x, c).
+  // Do this optimization only when adding c does not introduce extra
+  // carry-handling instructions.
   auto ReassociateAddOr = [&](SDValue N0, SDValue N1) {
     if (isADDLike(N0, DAG) && N0.hasOneUse() &&
         isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) {
-      return DAG.getNode(ISD::ADD, DL, VT,
-                         DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
-                         N0.getOperand(1));
+      // If N0's type is not split by legalization, or c is the sign mask
+      // (the minimum signed value), adding c introduces no carry.
+      auto TyActn = TLI.getTypeAction(*DAG.getContext(), N0.getValueType());
+      bool NoAddCarry = TyActn == TargetLoweringBase::TypeLegal ||
+                        TyActn == TargetLoweringBase::TypePromoteInteger ||
+                        isMinSignedConstant(N0.getOperand(1));
+      if (NoAddCarry)
+        return DAG.getNode(
+            ISD::ADD, DL, VT,
+            DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
+            N0.getOperand(1));
     }
     return SDValue();
   };
diff --git a/llvm/test/CodeGen/AArch64/add-i256.ll b/llvm/test/CodeGen/AArch64/add-i256.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/add-i256.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=neoverse-n1 < %s | FileCheck %s
+target triple = "aarch64-linux-unknown"
+
+define void @add_i256(i64 %x0, i64 %x1, i64 %x2, i64 %x3, i64 %y1, i64 %y2, i64 %y3, i8* %store_addr_ptr) {
+; CHECK-LABEL: add_i256:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adds x8, x0, #1
+; CHECK-NEXT:    adcs x9, x1, x4
+; CHECK-NEXT:    stp x8, x9, [x7]
+; CHECK-NEXT:    adcs x8, x2, x5
+; CHECK-NEXT:    adc x9, x3, x6
+; CHECK-NEXT:    stp x8, x9, [x7, #16]
+; CHECK-NEXT:    ret
+entry:
+  ; Build x_256 = x0 | x1 << 64 | x2 << 128 | x3 << 192
+  %temp = zext i64 %x0 to i256
+  %temp57 = zext i64 %x1 to i256
+  %temp58 = zext i64 %x2 to i256
+  %temp59 = zext i64 %x3 to i256
+  %temp_shifted = shl i256 %temp, 0
+  %temp_shifted60 = shl i256 %temp57, 64
+  %temp_shifted61 = shl i256 %temp58, 128
+  %temp_shifted62 = shl i256 %temp59, 192
+  %x = or i256 %temp_shifted, %temp_shifted60
+  %x63 = or i256 %x, %temp_shifted61
+  %x_big = or i256 %x63, %temp_shifted62
+
+  ; Build y_256 = 1 | y1 << 64 | y2 << 128 | y3 << 192
+  %temp65 = zext i64 %y1 to i256
+  %temp66 = zext i64 %y2 to i256
+  %temp67 = zext i64 %y3 to i256
+  %temp_shifted68 = shl i256 %temp65, 64
+  %temp_shifted69 = shl i256 %temp66, 128
+  %temp_shifted70 = shl i256 %temp67, 192
+  %y = or i256 1, %temp_shifted68
+  %y71 = or i256 %y, %temp_shifted69
+  %y_big = or i256 %y71, %temp_shifted70
+
+  ; z_256 = x_256 + y_256
+  %z_256 = add i256 %x_big, %y_big
+
+  ; split z_256 into 4 64-bit registers
+  %split_64bits = lshr i256 %z_256, 0
+  %z0 = trunc i256 %split_64bits to i64
+  %split_64bits74 = lshr i256 %z_256, 64
+  %z1 = trunc i256 %split_64bits74 to i64
+  %split_64bits76 = lshr i256 %z_256, 128
+  %z2 = trunc i256 %split_64bits76 to i64
+  %split_64bits78 = lshr i256 %z_256, 192
+  %z3 = trunc i256 %split_64bits78 to i64
+
+  %outptr0 = bitcast i8* %store_addr_ptr to i64*
+  store i64 %z0, i64* %outptr0, align 4
+  %gep = getelementptr i8, i8* %store_addr_ptr, i64 8
+  %outptr1 = bitcast i8* %gep to i64*
+  store i64 %z1, i64* %outptr1, align 4
+  %store_addr_ofs = getelementptr i8, i8* %store_addr_ptr, i64 16
+  %outptr081 = bitcast i8* %store_addr_ofs to i64*
+  store i64 %z2, i64* %outptr081, align 4
+  %gep82 = getelementptr i8, i8* %store_addr_ofs, i64 8
+  %outptr183 = bitcast i8* %gep82 to i64*
+  store i64 %z3, i64* %outptr183, align 4
+  ret void
+}
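
For illustration, here is a minimal sketch of the pattern ReassociateAddOr matches (hypothetical IR, not part of the patch; the function name is made up). Because the low bit of %x2 is known zero, (or %x2, 1) is ADD-like, i.e. equivalent to (add %x2, 1), so the combiner rewrites (add (or %x2, 1), %y) into (add (add %x2, %y), 1):

; Hypothetical reduction of the pattern exercised by the test above.
define i256 @reassoc_sketch(i256 %x, i256 %y) {
  %x2 = shl i256 %x, 1  ; low bit of %x2 is known zero
  %o = or i256 %x2, 1   ; ADD-like: shares no set bits with the constant 1
  %r = add i256 %o, %y  ; before this patch, reassociated through the or
  ret i256 %r
}

The reassociated form replaces one or and one add with two adds. On a wide type such as i256, each add expands into an adds/adc carry chain over i64 parts, so the rewrite materializes a second carry chain. With this patch, i256's type action is neither TypeLegal nor TypePromoteInteger (it is expanded), so NoAddCarry is false, the combine is skipped, and llc keeps the single adds/adcs chain shown in the CHECK lines of the test.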