Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8799,6 +8799,61 @@
   return NewST1;
 }
 
+/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
+/// load store optimizer pass will merge them to store pair stores. This should
+/// be better than a movi to create the vector zero followed by a vector store
+/// if the zero constant is not re-used, since one instruction and one register
+/// live range will be removed.
+///
+/// For example, the final generated code should be:
+///
+///   stp xzr, xzr, [x0]
+///
+/// instead of:
+///
+///   movi v0.2d, #0
+///   str q0, [x0]
+///
+static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
+  SDValue StVal = St->getValue();
+  EVT VT = StVal.getValueType();
+
+  // We can express a splat as store pair(s) for 2 or 4 elements.
+  int NumVecElts = VT.getVectorNumElements();
+  if (NumVecElts != 4 && NumVecElts != 2)
+    return SDValue();
+
+  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // If the zero constant has more than one use then the vector store could be
+  // better since the constant mov will be amortized and stp q instructions
+  // should be able to be formed.
+  if (!StVal.hasOneUse())
+    return SDValue();
+
+  // If the immediate offset of the address operand is too large for the stp
+  // instruction, then bail out.
+  if (DAG.isBaseWithConstantOffset(St->getBasePtr())) {
+    int64_t Offset = St->getBasePtr()->getConstantOperandVal(1);
+    if (Offset < -512 || Offset > 504)
+      return SDValue();
+  }
+
+  for (int I = 0; I < NumVecElts; ++I) {
+    SDValue EltVal = StVal.getOperand(I);
+    if (!isa<ConstantSDNode>(EltVal) ||
+        !cast<ConstantSDNode>(EltVal)->isNullValue())
+      return SDValue();
+  }
+  // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
+  // undoing this transformation.
+  return split16BStoreSplat(
+      DAG, St, NumVecElts == 4 ? DAG.getRegister(AArch64::WZR, MVT::i32)
+                               : DAG.getRegister(AArch64::XZR, MVT::i64),
+      NumVecElts);
+}
+
 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
 /// value. The load store optimizer pass will merge them to store pair stores.
 /// This has better performance than a splat of the scalar followed by a split
@@ -8862,6 +8917,17 @@
   if (S->isVolatile())
     return SDValue();
 
+  SDValue StVal = S->getValue();
+  EVT VT = StVal.getValueType();
+  if (!VT.isVector())
+    return SDValue();
+
+  // If we get a splat of zeros, convert this vector store to a store of
+  // scalars. They will be merged into store pairs of xzr thereby removing one
+  // instruction and one register.
+  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, S))
+    return ReplacedZeroSplat;
+
   // FIXME: The logic for deciding if an unaligned store should be split should
   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
   // a call to that function here.
@@ -8873,12 +8939,9 @@
   if (DAG.getMachineFunction().getFunction()->optForMinSize())
     return SDValue();
 
-  SDValue StVal = S->getValue();
-  EVT VT = StVal.getValueType();
-
   // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
   // those up regresses performance on micro-benchmarks and olden/bh.
-  if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();
 
   // Split unaligned 16B stores. They are terrible for performance.
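To make the intent of replaceZeroVectorStore concrete, here is a minimal IR sketch (illustrative only, not part of the patch; the function name is made up). A one-use zero-splat vector store like this is expected to go down the new path, producing the stp of xzr described in the comment above instead of a movi plus a q-register store:

define void @store_zero_v2i64(<2 x i64>* %p) {
entry:
  ; A single-use splat of zeros: replaceZeroVectorStore rewrites this store
  ; as two xzr stores, which AArch64LoadStoreOptimizer then pairs.
  store <2 x i64> zeroinitializer, <2 x i64>* %p, align 16
  ret void
}
; Expected codegen, per the comment in replaceZeroVectorStore:
;   stp xzr, xzr, [x0]
; instead of:
;   movi v0.2d, #0
;   str  q0, [x0]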
Index: llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -182,22 +182,22 @@
 ; CHECK: LD4Fourv2d
 ; CHECK: STRQui
 ; CHECK: ********** INTERVALS **********
-define void @testLdStConflict() {
+define void @testLdStConflict(<2 x i64> %v) {
 entry:
   br label %loop
 
 loop:
   %0 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8* null)
   %ptr = bitcast i8* undef to <2 x i64>*
-  store <2 x i64> zeroinitializer, <2 x i64>* %ptr, align 4
+  store <2 x i64> %v, <2 x i64>* %ptr, align 4
   %ptr1 = bitcast i8* undef to <2 x i64>*
-  store <2 x i64> zeroinitializer, <2 x i64>* %ptr1, align 4
+  store <2 x i64> %v, <2 x i64>* %ptr1, align 4
   %ptr2 = bitcast i8* undef to <2 x i64>*
-  store <2 x i64> zeroinitializer, <2 x i64>* %ptr2, align 4
+  store <2 x i64> %v, <2 x i64>* %ptr2, align 4
   %ptr3 = bitcast i8* undef to <2 x i64>*
-  store <2 x i64> zeroinitializer, <2 x i64>* %ptr3, align 4
+  store <2 x i64> %v, <2 x i64>* %ptr3, align 4
   %ptr4 = bitcast i8* undef to <2 x i64>*
-  store <2 x i64> zeroinitializer, <2 x i64>* %ptr4, align 4
+  store <2 x i64> %v, <2 x i64>* %ptr4, align 4
   br label %loop
 }
Index: llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll
+++ llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll
@@ -1333,3 +1333,134 @@
 end:
   ret void
 }
+
+; DAGCombiner::MergeConsecutiveStores merges this into a vector store;
+; replaceZeroVectorStore should split the vector store back into
+; scalar stores, which should get merged by AArch64LoadStoreOptimizer.
+define void @merge_zr32(i32* %p) {
+; CHECK-LABEL: merge_zr32:
+; CHECK: // %entry
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store i32 0, i32* %p
+  %p1 = getelementptr i32, i32* %p, i32 1
+  store i32 0, i32* %p1
+  ret void
+}
+
+; Same as merge_zr32, but the merged stores should also get paired.
+define void @merge_zr32_2(i32* %p) {
+; CHECK-LABEL: merge_zr32_2:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store i32 0, i32* %p
+  %p1 = getelementptr i32, i32* %p, i32 1
+  store i32 0, i32* %p1
+  %p2 = getelementptr i32, i32* %p, i64 2
+  store i32 0, i32* %p2
+  %p3 = getelementptr i32, i32* %p, i64 3
+  store i32 0, i32* %p3
+  ret void
+}
+
+; Like merge_zr32_2, but checking the largest allowed stp immediate offset.
+define void @merge_zr32_2_offset(i32* %p) {
+; CHECK-LABEL: merge_zr32_2_offset:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
+; CHECK-NEXT: ret
+entry:
+  %p0 = getelementptr i32, i32* %p, i32 126
+  store i32 0, i32* %p0
+  %p1 = getelementptr i32, i32* %p, i32 127
+  store i32 0, i32* %p1
+  %p2 = getelementptr i32, i32* %p, i64 128
+  store i32 0, i32* %p2
+  %p3 = getelementptr i32, i32* %p, i64 129
+  store i32 0, i32* %p3
+  ret void
+}
+
+; Like merge_zr32, but replaceZeroVectorStore should not split this
+; vector store since the address offset is too large for the stp
+; instruction.
+define void @no_merge_zr32_2_offset(i32* %p) {
+; CHECK-LABEL: no_merge_zr32_2_offset:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
+; CHECK-NEXT: ret
+entry:
+  %p0 = getelementptr i32, i32* %p, i32 1024
+  store i32 0, i32* %p0
+  %p1 = getelementptr i32, i32* %p, i32 1025
+  store i32 0, i32* %p1
+  %p2 = getelementptr i32, i32* %p, i64 1026
+  store i32 0, i32* %p2
+  %p3 = getelementptr i32, i32* %p, i64 1027
+  store i32 0, i32* %p3
+  ret void
+}
+
+; Like merge_zr32, but replaceZeroVectorStore should not split the
+; vector store since the zero constant vector has multiple uses, so we
+; err on the side that allows for stp q instruction generation.
+define void @merge_zr32_3(i32* %p) {
+; CHECK-LABEL: merge_zr32_3:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store i32 0, i32* %p
+  %p1 = getelementptr i32, i32* %p, i32 1
+  store i32 0, i32* %p1
+  %p2 = getelementptr i32, i32* %p, i64 2
+  store i32 0, i32* %p2
+  %p3 = getelementptr i32, i32* %p, i64 3
+  store i32 0, i32* %p3
+  %p4 = getelementptr i32, i32* %p, i64 4
+  store i32 0, i32* %p4
+  %p5 = getelementptr i32, i32* %p, i64 5
+  store i32 0, i32* %p5
+  %p6 = getelementptr i32, i32* %p, i64 6
+  store i32 0, i32* %p6
+  %p7 = getelementptr i32, i32* %p, i64 7
+  store i32 0, i32* %p7
+  ret void
+}
+
+; Similar to merge_zr32, but for 64-bit values.
+define void @merge_zr64(i64* %p) {
+; CHECK-LABEL: merge_zr64:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store i64 0, i64* %p
+  %p1 = getelementptr i64, i64* %p, i64 1
+  store i64 0, i64* %p1
+  ret void
+}
+
+; Similar to merge_zr32_3, replaceZeroVectorStore should not split the
+; vector store since the zero constant vector has multiple uses.
+define void @merge_zr64_2(i64* %p) {
+; CHECK-LABEL: merge_zr64_2:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store i64 0, i64* %p
+  %p1 = getelementptr i64, i64* %p, i64 1
+  store i64 0, i64* %p1
+  %p2 = getelementptr i64, i64* %p, i64 2
+  store i64 0, i64* %p2
+  %p3 = getelementptr i64, i64* %p, i64 3
+  store i64 0, i64* %p3
+  ret void
+}
Index: llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll
+++ llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll
@@ -10,11 +10,11 @@
 define i32 @main() local_unnamed_addr #1 {
 ; Make sure the stores happen in the correct order (the exact instructions could change).
 ; CHECK-LABEL: main:
+; CHECK: stp xzr, xzr, [sp, #72]
+; CHECK: str w9, [sp, #80]
 ; CHECK: str q0, [sp, #48]
 ; CHECK: ldr w8, [sp, #48]
-; CHECK: stur q1, [sp, #72]
 ; CHECK: str q0, [sp, #64]
-; CHECK: str w9, [sp, #80]
 
 for.body.lr.ph.i.i.i.i.i.i63:
   %b1 = alloca [10 x i32], align 16
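A note on the offset bounds exercised by merge_zr32_2_offset and no_merge_zr32_2_offset above (an assumption based on the A64 STP encoding, not spelled out in the patch): the stp of xzr that the load/store optimizer is expected to form encodes a signed 7-bit immediate scaled by 8 bytes, so the reachable byte offsets are

  -64 * 8 = -512   (smallest)
   63 * 8 =  504   (largest)

which matches the [-512, 504] range checked in replaceZeroVectorStore; the #4096 offset in no_merge_zr32_2_offset falls outside that range, so the vector store is left alone and the movi plus str q sequence is kept.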