Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -41,6 +41,7 @@ #include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" @@ -9310,11 +9311,33 @@ if (StVal.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); - // If the zero constant has more than one use then the vector store could be - // better since the constant mov will be amortized and stp q instructions - // should be able to be formed. - if (!StVal.hasOneUse()) - return SDValue(); + // If the zero constant has more than one use then the vector store + // could be better since the constant mov will be amortized and stp + // q instructions should be able to be formed. The exception to this + // is when all uses of StVal are non-consecutive 16-byte stores. In + // this case, it is better to replace each instance with stp xzr, xzr. + + if (!StVal.hasOneUse()) { + if (VT.getVectorElementType().getSizeInBits() != 64) + return SDValue(); + SmallVector STAddrs; + for (auto *U : StVal->uses()) { + if ((U->getOpcode() != ISD::STORE) || + (U->getOperand(1).getValueType().getSizeInBits() != 16 * 8)) + return SDValue(); + SDValue Addr = U->getOperand(2); + auto AddrDecomp = BaseIndexOffset::match(Addr, DAG); + for (SDValue &OtherAddr : STAddrs) { + auto OtherAddrDecomp = BaseIndexOffset::match(OtherAddr, DAG); + int64_t Offset = 0; + AddrDecomp.equalBaseIndex(OtherAddrDecomp, DAG, Offset); + if (Offset == 16 || Offset == -16) + return SDValue(); + } + + STAddrs.push_back(Addr); + } + } // If the immediate offset of the address operand is too large for the stp // instruction, then bail out. 
Index: test/CodeGen/AArch64/ldst-opt.ll =================================================================== --- test/CodeGen/AArch64/ldst-opt.ll +++ test/CodeGen/AArch64/ldst-opt.ll @@ -1613,13 +1613,9 @@ define void @merge_multiple_128bit_stores(i64* %p) { ; CHECK-LABEL: merge_multiple_128bit_stores ; CHECK: // %entry -; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 -; NOSTRICTALIGN-NEXT: str q0, [x0] -; NOSTRICTALIGN-NEXT: stur q0, [x0, #24] -; NOSTRICTALIGN-NEXT: str q0, [x0, #48] -; STRICTALIGN-NEXT: stp xzr, xzr, [x0] -; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #24] -; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48] +; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}] +; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #24] +; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #48] ; CHECK-NEXT: ret entry: store i64 0, i64* %p