Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -382,7 +382,7 @@ /// vector elements, try to merge them into one larger store. /// \return True if a merged store was created. bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl &StoreNodes, - EVT MemVT, unsigned NumElem, + EVT MemVT, unsigned StoresToMerge, bool IsConstantSrc, bool UseVector); /// Merge consecutive store operations into a wide store. @@ -9730,16 +9730,16 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( SmallVectorImpl &StoreNodes, EVT MemVT, - unsigned NumElem, bool IsConstantSrc, bool UseVector) { + unsigned StoresToMerge, bool IsConstantSrc, bool UseVector) { // Make sure we have something to merge. - if (NumElem < 2) + if (StoresToMerge < 2) return false; int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned EarliestNodeUsed = 0; - for (unsigned i=0; i < NumElem; ++i) { + for (unsigned i=0; i < StoresToMerge; ++i) { // Find a chain for the new wide-store operand. Notice that some // of the store nodes that we found may not be selected for inclusion // in the wide store. The chain we use needs to be the chain of the @@ -9754,9 +9754,16 @@ SDValue StoredVal; if (UseVector) { - // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); + bool IsVec = MemVT.isVector(); + unsigned Elts = StoresToMerge; + if (IsVec) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + // Get the type for the merged vector store. + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); + if (IsConstantSrc) { // A vector store with a constant source implies that the constant is // zero; we only handle merging stores of constant zeros because the zero @@ -9766,31 +9773,31 @@ StoredVal = DAG.getConstant(0, Ty); } else { SmallVector Ops; - for (unsigned i = 0; i < NumElem ; ++i) { + for (unsigned i = 0; i < StoresToMerge ; ++i) { StoreSDNode *St = cast(StoreNodes[i].MemNode); SDValue Val = St->getValue(); - // All of the operands of a BUILD_VECTOR must have the same type. + // All operands of BUILD_VECTOR / CONCAT_VECTOR must have the same type. if (Val.getValueType() != MemVT) return false; Ops.push_back(Val); } - // Build the extracted vector elements back into a vector. - StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops); + StoredVal = DAG.getNode(IsVec ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, + DL, Ty, Ops); } } else { // We should always use a vector store when merging extracted vector // elements, so this path implies a store of constants. assert(IsConstantSrc && "Merged vector elements should use vector store"); - unsigned StoreBW = NumElem * ElementSizeBytes * 8; + unsigned StoreBW = StoresToMerge * ElementSizeBytes * 8; APInt StoreInt(StoreBW, 0); // Construct a single integer constant which is made of the smaller // constant inputs. bool IsLE = TLI.isLittleEndian(); - for (unsigned i = 0; i < NumElem ; ++i) { - unsigned Idx = IsLE ? (NumElem - 1 - i) : i; + for (unsigned i = 0; i < StoresToMerge ; ++i) { + unsigned Idx = IsLE ? (StoresToMerge - 1 - i) : i; StoreSDNode *St = cast(StoreNodes[Idx].MemNode); SDValue Val = St->getValue(); StoreInt <<= ElementSizeBytes*8; @@ -9817,7 +9824,7 @@ // Replace the first store with the new store CombineTo(EarliestOp, NewStore); // Erase all other stores. - for (unsigned i = 0; i < NumElem ; ++i) { + for (unsigned i = 0; i < StoresToMerge ; ++i) { if (StoreNodes[i].MemNode == EarliestOp) continue; StoreSDNode *St = cast(StoreNodes[i].MemNode); @@ -9840,25 +9847,35 @@ } bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { - EVT MemVT = St->getMemoryVT(); - int64_t ElementSizeBytes = MemVT.getSizeInBits()/8; bool NoVectors = DAG.getMachineFunction().getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); - // Don't merge vectors into wider inputs. - if (MemVT.isVector() || !MemVT.isSimple()) - return false; - // Perform an early exit check. Do not bother looking at stored values that // are not constants, loads, or extracted vector elements. SDValue StoredVal = St->getValue(); bool IsLoadSrc = isa(StoredVal); bool IsConstantSrc = isa(StoredVal) || isa(StoredVal); - bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT); + bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || + StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR); - if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc) + if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc) + return false; + + EVT MemVT = St->getMemoryVT(); + int64_t ElementSizeBytes = MemVT.getSizeInBits()/8; + + // Don't merge vectors into wider vectors if the source data comes from loads. + // TODO: This restriction can be lifted by using logic similar to the + // ExtractVecSrc case. + // There's no such thing as a vector constant node; that merge case should be + // handled by looking through a BUILD_VECTOR source with all constant inputs. + if (MemVT.isVector() && IsLoadSrc) + return false; + + if (!MemVT.isSimple()) return false; + // Only look at ends of store sequences. SDValue Chain = SDValue(St, 0); @@ -10054,26 +10071,33 @@ // When extracting multiple vector elements, try to store them // in one vector store rather than a sequence of scalar stores. - if (IsExtractVecEltSrc) { - unsigned NumElem = 0; + if (IsExtractVecSrc) { + unsigned StoresToMerge = 0; + bool IsVec = MemVT.isVector(); for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) { StoreSDNode *St = cast(StoreNodes[i].MemNode); - SDValue StoredVal = St->getValue(); + unsigned StoreValOpcode = St->getValue().getOpcode(); // This restriction could be loosened. // Bail out if any stored values are not elements extracted from a vector. // It should be possible to handle mixed sources, but load sources need // more careful handling (see the block of code below that handles // consecutive loads). - if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT && + StoreValOpcode != ISD::EXTRACT_SUBVECTOR) return false; // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); - if (TLI.isTypeLegal(Ty)) - NumElem = i + 1; + unsigned Elts = i + 1; + if (IsVec) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + if (TLI.isTypeLegal(EVT::getVectorVT(*DAG.getContext(), + MemVT.getScalarType(), Elts))) + StoresToMerge = i + 1; } - return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, + return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, StoresToMerge, false, true); } Index: test/CodeGen/X86/MergeConsecutiveStores.ll =================================================================== --- test/CodeGen/X86/MergeConsecutiveStores.ll +++ test/CodeGen/X86/MergeConsecutiveStores.ll @@ -468,6 +468,64 @@ ; CHECK-NEXT: retq } +; PR21711 - Merge vector stores into wider vector stores. +define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) { + %idx0 = getelementptr inbounds <4 x float>* %ptr, i64 3 + %idx1 = getelementptr inbounds <4 x float>* %ptr, i64 4 + %idx2 = getelementptr inbounds <4 x float>* %ptr, i64 5 + %idx3 = getelementptr inbounds <4 x float>* %ptr, i64 6 + %shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> + %shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> + %shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> + %shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> + store <4 x float> %shuffle0, <4 x float>* %idx0, align 16 + store <4 x float> %shuffle1, <4 x float>* %idx1, align 16 + store <4 x float> %shuffle2, <4 x float>* %idx2, align 16 + store <4 x float> %shuffle3, <4 x float>* %idx3, align 16 + ret void + +; CHECK-LABEL: merge_vec_extract_stores +; CHECK: vmovups %ymm0, 48(%rdi) +; CHECK-NEXT: vmovups %ymm1, 80(%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; Merging vector stores when sourced from vector loads is not currently handled. +define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) { + %load_idx0 = getelementptr inbounds <4 x float>* %v, i64 0 + %load_idx1 = getelementptr inbounds <4 x float>* %v, i64 1 + %v0 = load <4 x float>* %load_idx0 + %v1 = load <4 x float>* %load_idx1 + %store_idx0 = getelementptr inbounds <4 x float>* %ptr, i64 0 + %store_idx1 = getelementptr inbounds <4 x float>* %ptr, i64 1 + store <4 x float> %v0, <4 x float>* %store_idx0, align 16 + store <4 x float> %v1, <4 x float>* %store_idx1, align 16 + ret void + +; CHECK-LABEL: merge_vec_stores_from_loads +; CHECK: vmovaps +; CHECK-NEXT: vmovaps +; CHECK-NEXT: vmovaps +; CHECK-NEXT: vmovaps +; CHECK-NEXT: retq +} + +; Merging vector stores when sourced from a constant vector is not currently handled. +define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) { + %idx0 = getelementptr inbounds <4 x i32>* %ptr, i64 3 + %idx1 = getelementptr inbounds <4 x i32>* %ptr, i64 4 + store <4 x i32> , <4 x i32>* %idx0, align 16 + store <4 x i32> , <4 x i32>* %idx1, align 16 + ret void + +; CHECK-LABEL: merge_vec_stores_of_constants +; CHECK: vxorps +; CHECK-NEXT: vmovaps +; CHECK-NEXT: vmovaps +; CHECK-NEXT: retq +} + ; This is a minimized test based on real code that was failing. ; We could merge stores (and loads) like this...