Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10946,8 +10946,7 @@
   if (ElementSizeBytes * 8 != MemVT.getSizeInBits())
     return false;
 
-  // Don't merge vectors into wider inputs.
-  if (MemVT.isVector() || !MemVT.isSimple())
+  if (!MemVT.isSimple())
     return false;
 
   // Perform an early exit check. Do not bother looking at stored values that
@@ -10956,9 +10955,16 @@
   bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
   bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
                        isa<ConstantFPSDNode>(StoredVal);
-  bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+  bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+                          StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);
 
-  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc)
+  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
+    return false;
+
+  // Don't merge vectors into wider vectors if the source data comes from loads.
+  // TODO: This restriction can be lifted by using logic similar to the
+  // ExtractVecSrc case.
+  if (MemVT.isVector() && IsLoadSrc)
     return false;
 
   // Only look at ends of store sequences.
@@ -11091,29 +11097,36 @@
   // When extracting multiple vector elements, try to store them
   // in one vector store rather than a sequence of scalar stores.
-  if (IsExtractVecEltSrc) {
-    unsigned NumElem = 0;
+  if (IsExtractVecSrc) {
+    unsigned NumStoresToMerge = 0;
+    bool IsVec = MemVT.isVector();
     for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
       StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
-      SDValue StoredVal = St->getValue();
+      unsigned StoreValOpcode = St->getValue().getOpcode();
       // This restriction could be loosened.
       // Bail out if any stored values are not elements extracted from a vector.
       // It should be possible to handle mixed sources, but load sources need
       // more careful handling (see the block of code below that handles
       // consecutive loads).
-      if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT &&
+          StoreValOpcode != ISD::EXTRACT_SUBVECTOR)
         return false;
 
       // Find a legal type for the vector store.
-      EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
+      unsigned Elts = i + 1;
+      if (IsVec) {
+        // When merging vector stores, get the total number of elements.
+        Elts *= MemVT.getVectorNumElements();
+      }
+      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
       bool IsFast;
       if (TLI.isTypeLegal(Ty) &&
           TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
                                  FirstStoreAlign, &IsFast) && IsFast)
-        NumElem = i + 1;
+        NumStoresToMerge = i + 1;
     }
 
-    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumStoresToMerge,
                                            false, true);
   }
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -795,9 +795,25 @@
                                                            bool *Fast) const {
   if (Subtarget->requiresStrictAlign())
     return false;
-  // FIXME: True for Cyclone, but not necessary others.
-  if (Fast)
-    *Fast = true;
+
+  // FIXME: This is mostly true for Cyclone, but not necessarily others.
+  if (Fast) {
+    // FIXME: Define an attribute for slow unaligned accesses instead of
+    // relying on the CPU type as a proxy.
+    // On Cyclone, unaligned 128-bit stores are slow.
+    *Fast = !Subtarget->isCyclone() || VT.getSizeInBits() != 128 ||
+            // See comments in performSTORECombine() for more details about
+            // these conditions.
+
+            // Code that uses clang vector extensions can mark that it
+            // wants unaligned accesses to be treated as fast by
+            // underspecifying alignment to be 1 or 2.
+            Align <= 2 ||
+
+            // Disregard v2i64. Memcpy lowering produces those and splitting
+            // them regresses performance on micro-benchmarks and olden/bh.
+            VT == MVT::v2i64;
+  }
   return true;
 }
@@ -8434,6 +8450,10 @@
   if (S->isVolatile())
     return SDValue();
 
+  // FIXME: The logic for deciding if an unaligned store should be split should
+  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
+  // a call to that function here.
+
   // Cyclone has bad performance on unaligned 16B stores when crossing line and
   // page boundaries. We want to split such stores.
   if (!Subtarget->isCyclone())
Index: test/CodeGen/AArch64/merge-store.ll
===================================================================
--- test/CodeGen/AArch64/merge-store.ll
+++ test/CodeGen/AArch64/merge-store.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march aarch64 %s -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cyclone | FileCheck %s --check-prefix=CYCLONE
 
 @g0 = external global <3 x float>, align 16
 @g1 = external global <3 x float>, align 4
@@ -18,3 +19,32 @@
   store float %tmp9, float* %tmp7
   ret void;
 }
+
+
+; PR21711 - Merge vector stores into wider vector stores.
+
+; On Cyclone, the stores should not get merged into a 16-byte store because
+; unaligned 16-byte stores are slow. This test would infinite loop when
+; the fastness of unaligned accesses was not specified correctly.
+
+define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
+  %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
+  %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4
+
+  %shuffle0 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuffle1 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+
+  store <2 x float> %shuffle0, <2 x float>* %idx0, align 8
+  store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
+  ret void
+
+; CHECK-LABEL: merge_vec_extract_stores
+; CHECK:      stur q0, [x0, #24]
+; CHECK-NEXT: ret
+
+; CYCLONE-LABEL: merge_vec_extract_stores
+; CYCLONE:      ext v1.16b, v0.16b, v0.16b, #8
+; CYCLONE-NEXT: str d0, [x0, #24]
+; CYCLONE-NEXT: str d1, [x0, #32]
+; CYCLONE-NEXT: ret
+}
Index: test/CodeGen/X86/MergeConsecutiveStores.ll
===================================================================
--- test/CodeGen/X86/MergeConsecutiveStores.ll
+++ test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -478,10 +478,8 @@
   ret void
 
 ; CHECK-LABEL: merge_vec_extract_stores
-; CHECK:      vmovaps %xmm0, 48(%rdi)
-; CHECK-NEXT: vextractf128 $1, %ymm0, 64(%rdi)
-; CHECK-NEXT: vmovaps %xmm1, 80(%rdi)
-; CHECK-NEXT: vextractf128 $1, %ymm1, 96(%rdi)
+; CHECK:      vmovups %ymm0, 48(%rdi)
+; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 }
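
Note (not part of the patch): a minimal standalone sketch of the element-count arithmetic behind the new EXTRACT_SUBVECTOR path in MergeConsecutiveStores. The function and variable names below are illustrative only, not LLVM APIs; the real code derives the count from MemVT and hands it to EVT::getVectorVT() before the legality and allowsMemoryAccess() checks.

// Sketch: how many scalar elements the merged vector store must cover.
#include <cassert>

unsigned mergedStoreElementCount(unsigned NumStores, bool MemVTIsVector,
                                 unsigned ElementsPerStoredValue) {
  unsigned Elts = NumStores;
  // When the individual stores already write vectors (e.g. <2 x float>),
  // the merged type needs NumStores * elements-per-store scalar elements.
  if (MemVTIsVector)
    Elts *= ElementsPerStoredValue;
  return Elts;
}

int main() {
  // Two consecutive <2 x float> stores merge into one <4 x float> (16 bytes).
  // That is only profitable when the target reports unaligned 16-byte stores
  // as fast, which is why the AArch64/Cyclone hook above must answer honestly.
  assert(mergedStoreElementCount(2, /*MemVTIsVector=*/true, 2) == 4);
  // The scalar case is unchanged: i+1 scalar stores yield an (i+1)-element type.
  assert(mergedStoreElementCount(4, /*MemVTIsVector=*/false, 1) == 4);
  return 0;
}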