Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8844,13 +8844,10 @@
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
 }
 
-static SDValue split16BStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
-                                  SDValue SplatVal, unsigned NumVecElts) {
-  assert((NumVecElts == 4 || NumVecElts == 2) && "Unexpected NumVecElts");
-
+static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
+                               SDValue SplatVal, unsigned NumVecElts) {
   unsigned OrigAlignment = St.getAlignment();
-  unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
-  unsigned Alignment = std::min(OrigAlignment, EltOffset);
+  unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
 
   // Create scalar stores. This is at least as good as the code sequence for a
   // split unaligned store which is a dup.s, ext.b, and two stores.
@@ -8860,10 +8857,11 @@
   SDValue BasePtr = St.getBasePtr();
   SDValue NewST1 =
       DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, St.getPointerInfo(),
-                   St.getAlignment(), St.getMemOperand()->getFlags());
+                   OrigAlignment, St.getMemOperand()->getFlags());
 
   unsigned Offset = EltOffset;
   while (--NumVecElts) {
+    unsigned Alignment = MinAlign(OrigAlignment, Offset);
     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                     DAG.getConstant(Offset, DL, MVT::i64));
     NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
@@ -8893,9 +8891,13 @@
   SDValue StVal = St.getValue();
   EVT VT = StVal.getValueType();
 
-  // We can express a splat as store pair(s) for 2 or 4 elements.
+  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
+  // 2, 3 or 4 i32 elements.
   int NumVecElts = VT.getVectorNumElements();
-  if (NumVecElts != 4 && NumVecElts != 2)
+  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
+         VT.getVectorElementType().getSizeInBits() == 64) ||
+        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
+         VT.getVectorElementType().getSizeInBits() == 32)))
     return SDValue();
 
   if (StVal.getOpcode() != ISD::BUILD_VECTOR)
@@ -8917,16 +8919,16 @@
 
   for (int I = 0; I < NumVecElts; ++I) {
     SDValue EltVal = StVal.getOperand(I);
-    if (!isa<ConstantSDNode>(EltVal) ||
-        !cast<ConstantSDNode>(EltVal)->isNullValue())
+    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
       return SDValue();
   }
+
   // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
   // undoing this transformation.
-  return split16BStoreSplat(
-      DAG, St, NumVecElts == 4 ? DAG.getRegister(AArch64::WZR, MVT::i32)
-                               : DAG.getRegister(AArch64::XZR, MVT::i64),
-      NumVecElts);
+  SDValue SplatVal = VT.getVectorElementType().getSizeInBits() == 32 ?
+                         DAG.getRegister(AArch64::WZR, MVT::i32)
+                         : DAG.getRegister(AArch64::XZR, MVT::i64);
+  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
 }
 
 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
@@ -8979,12 +8981,12 @@
   if (IndexNotInserted.any())
     return SDValue();
 
-  return split16BStoreSplat(DAG, St, SplatVal, NumVecElts);
+  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
 }
 
-static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
-                              SelectionDAG &DAG,
-                              const AArch64Subtarget *Subtarget) {
+static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                           SelectionDAG &DAG,
+                           const AArch64Subtarget *Subtarget) {
   if (!DCI.isBeforeLegalize())
     return SDValue();
 
@@ -9174,7 +9176,7 @@
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG,
                                        const AArch64Subtarget *Subtarget) {
-  if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget))
+  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
     return Split;
 
   if (Subtarget->supportsAddressTopByteIgnored() &&
Index: llvm/trunk/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -6174,11 +6174,10 @@
 }
 
 ; Check for dependencies between the vector and the scalar load.
-define <4 x float> @test_v4f32_post_reg_ld1lane_dep_vec_on_load(float* %bar, float** %ptr, i64 %inc, <4 x float>* %dep_ptr_1, <4 x float>* %dep_ptr_2) {
+define <4 x float> @test_v4f32_post_reg_ld1lane_dep_vec_on_load(float* %bar, float** %ptr, i64 %inc, <4 x float>* %dep_ptr_1, <4 x float>* %dep_ptr_2, <4 x float> %vec) {
 ; CHECK-LABEL: test_v4f32_post_reg_ld1lane_dep_vec_on_load:
 ; CHECK: BB#0:
 ; CHECK-NEXT: ldr s[[LD:[0-9]+]], [x0]
-; CHECK-NEXT: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: str q0, [x3]
 ; CHECK-NEXT: ldr q0, [x4]
 ; CHECK-NEXT: ins.s v0[1], v[[LD]][0]
@@ -6186,7 +6185,7 @@
 ; CHECK-NEXT: str [[POST]], [x1]
 ; CHECK-NEXT: ret
   %tmp1 = load float, float* %bar
-  store <4 x float> zeroinitializer, <4 x float>* %dep_ptr_1, align 16
+  store <4 x float> %vec, <4 x float>* %dep_ptr_1, align 16
   %A = load <4 x float>, <4 x float>* %dep_ptr_2, align 16
   %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1
   %tmp3 = getelementptr float, float* %bar, i64 %inc
Index: llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll
+++ llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll
@@ -1433,6 +1433,62 @@
   ret void
 }
 
+; Like merge_zr32, but with 2-vector type.
+define void @merge_zr32_2vec(<2 x i32>* %p) {
+; CHECK-LABEL: merge_zr32_2vec:
+; CHECK: // %entry
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <2 x i32> zeroinitializer, <2 x i32>* %p
+  ret void
+}
+
+; Like merge_zr32, but with 3-vector type.
+define void @merge_zr32_3vec(<3 x i32>* %p) {
+; CHECK-LABEL: merge_zr32_3vec:
+; CHECK: // %entry
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: str wzr, [x{{[0-9]+}}, #8]
+; CHECK-NEXT: ret
+entry:
+  store <3 x i32> zeroinitializer, <3 x i32>* %p
+  ret void
+}
+
+; Like merge_zr32, but with 4-vector type.
+define void @merge_zr32_4vec(<4 x i32>* %p) {
+; CHECK-LABEL: merge_zr32_4vec:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <4 x i32> zeroinitializer, <4 x i32>* %p
+  ret void
+}
+
+; Like merge_zr32, but with 2-vector float type.
+define void @merge_zr32_2vecf(<2 x float>* %p) {
+; CHECK-LABEL: merge_zr32_2vecf:
+; CHECK: // %entry
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <2 x float> zeroinitializer, <2 x float>* %p
+  ret void
+}
+
+; Like merge_zr32, but with 4-vector float type.
+define void @merge_zr32_4vecf(<4 x float>* %p) {
+; CHECK-LABEL: merge_zr32_4vecf:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <4 x float> zeroinitializer, <4 x float>* %p
+  ret void
+}
+
 ; Similar to merge_zr32, but for 64-bit values.
 define void @merge_zr64(i64* %p) {
 ; CHECK-LABEL: merge_zr64:
@@ -1464,3 +1520,38 @@
   store i64 0, i64* %p3
   ret void
 }
+
+; Like merge_zr64, but with 2-vector double type.
+define void @merge_zr64_2vecd(<2 x double>* %p) {
+; CHECK-LABEL: merge_zr64_2vecd:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <2 x double> zeroinitializer, <2 x double>* %p
+  ret void
+}
+
+; Like merge_zr64, but with 3-vector i64 type.
+define void @merge_zr64_3vec(<3 x i64>* %p) {
+; CHECK-LABEL: merge_zr64_3vec:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}, #16]
+; CHECK-NEXT: ret
+entry:
+  store <3 x i64> zeroinitializer, <3 x i64>* %p
+  ret void
+}
+
+; Like merge_zr64_2, but with 4-vector double type.
+define void @merge_zr64_4vecd(<4 x double>* %p) {
+; CHECK-LABEL: merge_zr64_4vecd:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <4 x double> zeroinitializer, <4 x double>* %p
+  ret void
+}
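
For reference, the splitting policy the patch implements can be summarized with a small standalone C++ sketch. This is not LLVM code: MinAlign is re-stated below with the same definition as the LLVM support helper, and shouldScalarizeZeroSplat/describeSplit are invented names used only for illustration. It shows which zero-splat vector stores the patched replaceZeroVectorStore/splitStoreSplat now scalarize, and the offset and alignment of each WZR/XZR store they emit; the str xzr / stp xzr, xzr sequences the ldst-opt.ll CHECK lines expect are produced afterwards, when the AArch64 load/store optimizer merges these adjacent scalar zero stores.

// Standalone sketch (not part of the patch). Only the numeric policy mirrors
// the patched code; the helper names are hypothetical.
#include <cstdio>

// Same definition as llvm::MinAlign: the largest power of two that divides
// both A and B (the lowest set bit of A | B).
static unsigned MinAlign(unsigned A, unsigned B) {
  return (A | B) & (1 + ~(A | B));
}

// Mirrors the new eligibility check: 2 or 3 x 64-bit elements, or 2, 3 or 4
// x 32-bit elements (integer or floating point).
static bool shouldScalarizeZeroSplat(unsigned NumElts, unsigned EltBits) {
  return ((NumElts == 2 || NumElts == 3) && EltBits == 64) ||
         ((NumElts >= 2 && NumElts <= 4) && EltBits == 32);
}

// Prints the scalar zero stores splitStoreSplat would emit for a zero-splat
// store of NumElts elements of EltBits bits with the given original alignment.
static void describeSplit(unsigned NumElts, unsigned EltBits,
                          unsigned OrigAlignment) {
  if (!shouldScalarizeZeroSplat(NumElts, EltBits)) {
    std::printf("<%u x i%u>: left as a vector store\n", NumElts, EltBits);
    return;
  }
  const char *ZeroReg = EltBits == 32 ? "wzr" : "xzr";
  unsigned EltOffset = EltBits / 8;
  // The first store keeps the original alignment; each later store at byte
  // offset Off uses MinAlign(OrigAlignment, Off), as in the patched loop.
  std::printf("<%u x i%u>: str %s, [p]       ; align %u\n", NumElts, EltBits,
              ZeroReg, OrigAlignment);
  for (unsigned I = 1, Off = EltOffset; I < NumElts; ++I, Off += EltOffset)
    std::printf("           str %s, [p, #%u]  ; align %u\n", ZeroReg, Off,
                MinAlign(OrigAlignment, Off));
}

int main() {
  describeSplit(3, 32, 16); // cf. the merge_zr32_3vec test
  describeSplit(2, 64, 16); // cf. the merge_zr64_2vecd test
  describeSplit(4, 64, 32); // too wide: not scalarized (merge_zr64_4vecd)
  return 0;
}

For the <3 x i32> case this prints a wzr store at offsets 0, 4 and 8; the load/store optimizer then folds the first two into the str xzr + str wzr pair checked by merge_zr32_3vec, while 4 x i64 stays a vector store and still uses movi + stp q as merge_zr64_4vecd shows.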