Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9247,6 +9247,41 @@
   return NewST1;
 }
 
+// Check if addresses X and Y differ by 16 in one direction or another.
+static bool BaseOffsetDifference16(SDValue X, SDValue Y) {
+  int64_t PartialOffset = 0;
+  if (X.getOpcode() == ISD::SIGN_EXTEND)
+    X = X->getOperand(0);
+  if (Y.getOpcode() == ISD::SIGN_EXTEND)
+    Y = Y->getOperand(0);
+  if (X.getOpcode() == ISD::ADD && isa<ConstantSDNode>(X->getOperand(1))) {
+    PartialOffset -= cast<ConstantSDNode>(X->getOperand(1))->getSExtValue();
+    X = X->getOperand(0);
+  }
+  if (Y.getOpcode() == ISD::ADD && isa<ConstantSDNode>(Y->getOperand(1))) {
+    PartialOffset += cast<ConstantSDNode>(Y->getOperand(1))->getSExtValue();
+    Y = Y->getOperand(0);
+  }
+  while (X.getOpcode() == ISD::ADD && Y.getOpcode() == ISD::ADD) {
+    if (X->getOperand(0) == Y->getOperand(0)) {
+      X = X->getOperand(1);
+      Y = Y->getOperand(1);
+    } else if (X->getOperand(1) == Y->getOperand(1)) {
+      X = X->getOperand(0);
+      Y = Y->getOperand(0);
+    } else
+      break;
+  }
+  if (isa<ConstantSDNode>(X) && isa<ConstantSDNode>(Y)) {
+    PartialOffset -= cast<ConstantSDNode>(X)->getSExtValue();
+    PartialOffset += cast<ConstantSDNode>(Y)->getSExtValue();
+    return (PartialOffset == 16) || (PartialOffset == -16);
+  }
+  if (X == Y)
+    return (PartialOffset == 16) || (PartialOffset == -16);
+  return false;
+}
+
 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
 /// load store optimizer pass will merge them to store pair stores. This should
 /// be better than a movi to create the vector zero followed by a vector store
@@ -9278,11 +9313,27 @@
   if (StVal.getOpcode() != ISD::BUILD_VECTOR)
     return SDValue();
 
-  // If the zero constant has more than one use then the vector store could be
-  // better since the constant mov will be amortized and stp q instructions
-  // should be able to be formed.
-  if (!StVal.hasOneUse())
-    return SDValue();
+  // If the zero constant has more than one use then the vector store
+  // could be better since the constant mov will be amortized and stp
+  // q instructions should be able to be formed. The exception to this
+  // is when all uses of StVal are non-consecutive 16-byte stores. In
+  // that case, it is better to replace each one with stp xzr, xzr.
+
+  if (!StVal.hasOneUse()) {
+    if (VT.getVectorElementType().getSizeInBits() != 64)
+      return SDValue();
+    SmallVector<SDValue, 8> STAddrs;
+    for (auto *U : StVal->uses()) {
+      if ((U->getOpcode() != ISD::STORE) ||
+          (U->getOperand(1).getValueType().getSizeInBits() != 16 * 8))
+        return SDValue();
+      SDValue Addr = U->getOperand(2);
+      for (SDValue &OtherAddr : STAddrs)
+        if (BaseOffsetDifference16(Addr, OtherAddr))
+          return SDValue();
+      STAddrs.push_back(Addr);
+    }
+  }
 
   // If the immediate offset of the address operand is too large for the stp
   // instruction, then bail out.
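For reference, the decision the new code makes reduces to this: once two zero-store addresses have been stripped down to a common base plus constant offsets, they only block the scalar lowering if they are exactly 16 bytes apart (in which case the existing movi + stp q path is preferable); otherwise each 128-bit zero store can be lowered to stp xzr, xzr. The standalone C++ sketch below illustrates that check on an already-simplified base+offset form; the Addr struct and baseOffsetDifference16 function are hypothetical illustrations, not LLVM code.

#include <cstdint>
#include <cstdlib>
#include <string>

// Hypothetical simplified address: a symbolic base plus a constant byte
// offset, which is what BaseOffsetDifference16 recovers by walking
// SIGN_EXTEND/ADD nodes and cancelling matching operands in the DAG.
struct Addr {
  std::string Base;
  int64_t Offset;
};

// True when X and Y are exactly 16 bytes apart off the same base, i.e. the
// two 128-bit zero stores are consecutive and would merge into stp q.
static bool baseOffsetDifference16(const Addr &X, const Addr &Y) {
  return X.Base == Y.Base && std::llabs(X.Offset - Y.Offset) == 16;
}

int main() {
  Addr A{"p", 0}, B{"p", 16}, C{"p", 24};
  // A/B are consecutive 16-byte slots: keep the vector (movi + stp q) form.
  // A/C are not consecutive: each store is better replaced by stp xzr, xzr.
  return (baseOffsetDifference16(A, B) && !baseOffsetDifference16(A, C)) ? 0 : 1;
}

In the DAG the addresses are not this tidy, which is why the patched code first peels SIGN_EXTEND and constant ADD operands, then cancels common ADD operands from both sides before comparing the accumulated constant offsets.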
Index: test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- test/CodeGen/AArch64/ldst-opt.ll
+++ test/CodeGen/AArch64/ldst-opt.ll
@@ -1608,3 +1608,59 @@
   store <4 x double> zeroinitializer, <4 x double>* %p
   ret void
 }
+
+; Verify that non-consecutive merges do not generate q0
+define void @merge_multiple_128bit_stores(i64* %p) {
+; CHECK-LABEL: merge_multiple_128bit_stores
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #24]
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #48]
+; CHECK-NEXT: ret
+entry:
+  store i64 0, i64* %p
+  %p1 = getelementptr i64, i64* %p, i64 1
+  store i64 0, i64* %p1
+  %p3 = getelementptr i64, i64* %p, i64 3
+  store i64 0, i64* %p3
+  %p4 = getelementptr i64, i64* %p, i64 4
+  store i64 0, i64* %p4
+  %p6 = getelementptr i64, i64* %p, i64 6
+  store i64 0, i64* %p6
+  %p7 = getelementptr i64, i64* %p, i64 7
+  store i64 0, i64* %p7
+  ret void
+}
+
+; Verify that consecutive stores generate stp q
+define void @merge_multiple_128bit_stores_consec(i64* %p) {
+; CHECK-LABEL: merge_multiple_128bit_stores_consec
+; CHECK: // %entry
+; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}, #32]
+; STRICTALIGN-NEXT: stp xzr, xzr, [x0]
+; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #16]
+; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #32]
+; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48]
+; CHECK-NEXT: ret
+entry:
+  store i64 0, i64* %p
+  %p1 = getelementptr i64, i64* %p, i64 1
+  store i64 0, i64* %p1
+  %p2 = getelementptr i64, i64* %p, i64 2
+  store i64 0, i64* %p2
+  %p3 = getelementptr i64, i64* %p, i64 3
+  store i64 0, i64* %p3
+  %p4 = getelementptr i64, i64* %p, i64 4
+  store i64 0, i64* %p4
+  %p5 = getelementptr i64, i64* %p, i64 5
+  store i64 0, i64* %p5
+  %p6 = getelementptr i64, i64* %p, i64 6
+  store i64 0, i64* %p6
+  %p7 = getelementptr i64, i64* %p, i64 7
+  store i64 0, i64* %p7
+  ret void
+}
+
+