diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3410,6 +3410,10 @@
                                        bool LegalOperations, bool ForCodeSize,
                                        unsigned Depth = 0) const;
 
+  /// Return true if splitting a vector store into scalar stores is profitable.
+  virtual bool isCheapToSplitStore(StoreSDNode *N, unsigned NumSplit,
+                                   SelectionDAG &DAG) const;
+
   //===--------------------------------------------------------------------===//
   // Lowering methods - These methods must be implemented by targets so that
   // the SelectionDAGBuilder code knows how to lower these.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -547,6 +547,7 @@
     SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
     SDValue MatchLoadCombine(SDNode *N);
     SDValue MatchStoreCombine(StoreSDNode *N);
+    SDValue MatchVectorStoreSplit(StoreSDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
@@ -6612,6 +6613,211 @@
   return NewStore;
 }
 
+/// getVectorUpdatePath - Detect a load-insert/shuffle-store chain. Returns
+/// the head LOAD if such a pattern is found; otherwise returns a null
+/// SDValue. Path records each node on the path together with the operand
+/// index to visit next.
+static SDValue
+getVectorUpdatePath(SDValue Current,
+                    SmallVectorImpl<std::pair<SDValue, unsigned>> &Path) {
+  // A value that is directly a LOAD has no updates to split off.
+  if (Current.getOpcode() == ISD::LOAD)
+    return SDValue();
+
+  const unsigned MaxDepth = 12;
+  unsigned Depth = 0;
+
+  // Use DFS to find the desired LOAD. This buffer keeps the nodes we have
+  // not visited yet; if the current path fails, backtrack and pick another.
+  SmallVector<SDValue, 8> SearchBuffer;
+
+  while (Current) {
+    if (Depth >= MaxDepth)
+      return SDValue();
+
+    if (Current.getOpcode() == ISD::LOAD)
+      return Current;
+    else if (Current.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+      // INSERT_VECTOR_ELT nodes have only one vector operand to follow.
+      ++Depth;
+      Path.push_back(std::make_pair(Current, 0));
+      Current = Current.getOperand(0);
+    } else if (Current.getOpcode() == ISD::VECTOR_SHUFFLE) {
+      // For VECTOR_SHUFFLE, follow the first non-constant operand and push
+      // the other one into the search buffer.
+      ++Depth;
+      SDValue Op1 = Current.getOperand(0);
+      SDValue Op2 = Current.getOperand(1);
+
+      // Since we want a LOAD at the root of the path, a constant vector
+      // (BUILD_VECTOR) operand of the shuffle cannot be what we want.
+      if (Op1.getOpcode() == ISD::BUILD_VECTOR) {
+        Path.push_back(std::make_pair(Current, 1));
+        Current = Op2;
+      } else {
+        Path.push_back(std::make_pair(Current, 0));
+        Current = Op1;
+        if (Op2.getOpcode() != ISD::BUILD_VECTOR)
+          SearchBuffer.push_back(Op2);
+      }
+    } else {
+      if (SearchBuffer.empty())
+        return SDValue();
+
+      // Backtrack along the path until the node on top of the search buffer
+      // is the unvisited operand of a shuffle on the path, then resume there.
+      while (!Path.empty()) {
+        std::pair<SDValue, unsigned> Top = Path.pop_back_val();
+        if (Top.first.getOpcode() != ISD::VECTOR_SHUFFLE)
+          continue;
+        unsigned NewPath = !Top.second;
+        if (Top.first.getOperand(NewPath) == SearchBuffer.back()) {
+          SearchBuffer.pop_back();
+          Path.push_back(std::make_pair(Top.first, NewPath));
+          Current = Top.first.getOperand(NewPath);
+          break;
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+/// getVectorUpdates - Update a state table of the elements produced by a
+/// load-insert/shuffle-store chain. The table lets us simplify the vector
+/// store code.
+static bool
+getVectorUpdates(SmallVectorImpl<std::pair<SDValue, int>> &States,
+                 SmallVectorImpl<std::pair<SDValue, unsigned>> &Worklist,
+                 const SDValue &Source) {
+  bool Changed = false;
+  auto VecLen = Source.getValueType().getVectorNumElements();
+  SDValue LastItem;
+
+  // Each entry in States describes one element of the stored vector:
+  // - first is a null SDValue if the element is undef.
+  // - first is a scalar value if the element was inserted as a scalar.
+  // - first is a vector and second the lane index if the element is taken
+  //   from a vector.
+
+  while (!Worklist.empty()) {
+    auto Tail = Worklist.pop_back_val();
+
+    // INSERT_VECTOR_ELT changes only one element, so update just that entry.
+    if (Tail.first.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+      auto *IndexNode = dyn_cast<ConstantSDNode>(Tail.first.getOperand(2));
+      if (IndexNode == nullptr)
+        return false;
+      auto IndexValue = IndexNode->getZExtValue();
+      if (IndexValue >= VecLen)
+        return false;
+      States[IndexValue].first = Tail.first.getOperand(1);
+      Changed = true;
+    } else if (Tail.first.getOpcode() == ISD::VECTOR_SHUFFLE) {
+      // Elements may get exchanged, so take a copy of the table to detect
+      // whether anything actually changed.
+      const ShuffleVectorSDNode *SVN =
+          cast<ShuffleVectorSDNode>(Tail.first.getNode());
+      SmallVector<std::pair<SDValue, int>, 16> OriginalStates(States.begin(),
+                                                              States.end());
+      // We don't support shuffles that change the vector length.
+      auto MaskSize = SVN->getMask().size();
+      if (MaskSize != VecLen)
+        return false;
+
+      for (unsigned i = 0; i < MaskSize; i++) {
+        int Mask = SVN->getMaskElt(i);
+        if (Mask < 0)
+          States[i].first = SDValue();
+        else {
+          SDValue ChosenVec = SVN->getOperand(Mask >= (int)VecLen);
+          States[i].second = (Mask < (int)VecLen) ? Mask : Mask - VecLen;
+
+          // If this lane is taken from the vector we just processed, keep
+          // the existing entry; otherwise record the new source vector.
+          if (ChosenVec != LastItem)
+            States[i].first = ChosenVec;
+        }
+
+        if (OriginalStates[i] != States[i])
+          Changed = true;
+      }
+    }
+
+    LastItem = Tail.first;
+  }
+
+  return Changed;
+}
+
+/// Match a pattern where only a few elements of a vector are updated but the
+/// whole vector is stored. Recognize the chain that updates the vector and
+/// split the store into individual element stores.
+SDValue DAGCombiner::MatchVectorStoreSplit(StoreSDNode *N) {
+  SmallVector<std::pair<SDValue, unsigned>, 16> Buf;
+  SmallVector<std::pair<SDValue, int>, 16> States;
+
+  EVT VecType = N->getValue().getValueType();
+  EVT EleVT = VecType.getScalarType();
+
+  // Scalable vectors are not supported yet.
+  if (!N->isSimple() || VecType.isScalableVector() || !VecType.isVector())
+    return SDValue();
+
+  if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), EleVT) ||
+      !hasOperation(N->getOpcode(), EleVT))
+    return SDValue();
+
+  SDValue SourceVecLoad = getVectorUpdatePath(N->getValue(), Buf);
+  LoadSDNode *LoadNode = dyn_cast_or_null<LoadSDNode>(SourceVecLoad.getNode());
+  if (!LoadNode || !LoadNode->isSimple())
+    return SDValue();
+
+  // Prepare a table initialized with the source vector and its lane indices,
+  // so we can find which elements are updated along the chain.
+  auto VecLen = SourceVecLoad.getValueType().getVectorNumElements();
+  for (unsigned i = 0; i < VecLen; ++i)
+    States.push_back(std::make_pair(SourceVecLoad, i));
+
+  if (!getVectorUpdates(States, Buf, SourceVecLoad))
+    return SDValue();
+
+  // Walk the table to collect the indices of the updated elements.
+  SmallVector<unsigned, 16> UpdatedElementsIdx;
+  for (unsigned i = 0; i < States.size(); ++i) {
+    SDValue &State = States[i].first;
+    if (!State)
+      continue;
+
+    // Extracting from another non-constant vector just to insert the element
+    // again is not profitable, so give up in that case.
+    if (State.getValueType().isVector()) {
+      if (State.getOpcode() == ISD::BUILD_VECTOR) {
+        State = State.getOperand(States[i].second);
+        UpdatedElementsIdx.push_back(i);
+      } else if (State != SourceVecLoad || States[i].second != (int)i)
+        return SDValue();
+    } else
+      UpdatedElementsIdx.push_back(i);
+  }
+
+  if (TLI.isCheapToSplitStore(N, UpdatedElementsIdx.size(), DAG)) {
+    SDLoc Loc = SDLoc(N);
+    SDValue Index, Chain;
+
+    // Emit one scalar store per updated element, addressed relative to the
+    // vector's base pointer.
+    for (unsigned i = 0; i < UpdatedElementsIdx.size(); ++i) {
+      Index = TLI.getVectorElementPointer(
+          DAG, N->getBasePtr(), VecType,
+          DAG.getConstant(UpdatedElementsIdx[i], Loc, MVT::i32));
+      Chain = DAG.getStore(Chain ? Chain : N->getChain(), Loc,
+                           States[UpdatedElementsIdx[i]].first, Index,
+                           N->getPointerInfo());
+    }
+    return Chain;
+  }
+
+  return SDValue();
+}
+
 /// Match a pattern where a wide type scalar value is loaded by several narrow
 /// loads and combined by shifts and ors. Fold it into a single load or a load
 /// and a BSWAP if the targets supports it.
@@ -16192,6 +16398,9 @@
   if (SDValue Store = MatchStoreCombine(ST))
     return Store;
 
+  if (SDValue Split = MatchVectorStoreSplit(ST))
+    return Split;
+
   if (ST->isUnindexed()) {
     // Walk up chain skipping non-aliasing memory nodes, on this store and any
     // adjacent stores.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5580,6 +5580,11 @@
   llvm_unreachable("Unknown code");
 }
 
+bool TargetLowering::isCheapToSplitStore(StoreSDNode *N, unsigned NumSplit,
+                                         SelectionDAG &DAG) const {
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Legalization Utilities
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -646,6 +646,9 @@
       return true;
     }
 
+    bool isCheapToSplitStore(StoreSDNode *N, unsigned NumSplit,
+                             SelectionDAG &DAG) const override;
+
     bool isCtlzFast() const override {
       return true;
     }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1598,6 +1598,17 @@
   return true;
 }
 
+bool PPCTargetLowering::isCheapToSplitStore(StoreSDNode *N,
+                                            unsigned NumSplit,
+                                            SelectionDAG &DAG) const {
+  EVT VecType = N->getValue().getValueType();
+
+  if (!VecType.isVector() || NumSplit >= VecType.getVectorNumElements())
+    return false;
+
+  return true;
+}
+
 /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
 /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
 /// The ShuffleKind distinguishes between big-endian merges with two
diff --git a/llvm/test/CodeGen/PowerPC/swaps-le-5.ll b/llvm/test/CodeGen/PowerPC/swaps-le-5.ll
--- a/llvm/test/CodeGen/PowerPC/swaps-le-5.ll
+++ b/llvm/test/CodeGen/PowerPC/swaps-le-5.ll
@@ -10,7 +10,8 @@
 entry:
   %0 = load <2 x double>, <2 x double>* @x, align 16
   %vecins = insertelement <2 x double> %0, double %y, i32 0
-  store <2 x double> %vecins, <2 x double>* @z, align 16
+  ; Add volatile to keep the vector store from being split.
+  store volatile <2 x double> %vecins, <2 x double>* @z, align 16
   ret void
 }
 
@@ -25,7 +26,7 @@
 entry:
   %0 = load <2 x double>, <2 x double>* @x, align 16
   %vecins = insertelement <2 x double> %0, double %y, i32 1
-  store <2 x double> %vecins, <2 x double>* @z, align 16
+  store volatile <2 x double> %vecins, <2 x double>* @z, align 16
   ret void
 }
 
@@ -57,7 +58,7 @@
   %0 = load <2 x double>, <2 x double>* @z, align 16
   %1 = load <2 x double>, <2 x double>* @x, align 16
   %vecins = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 0, i32 2>
-  store <2 x double> %vecins, <2 x double>* @z, align 16
+  store volatile <2 x double> %vecins, <2 x double>* @z, align 16
   ret void
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/swaps-le-6.ll b/llvm/test/CodeGen/PowerPC/swaps-le-6.ll
--- a/llvm/test/CodeGen/PowerPC/swaps-le-6.ll
+++ b/llvm/test/CodeGen/PowerPC/swaps-le-6.ll
@@ -60,7 +60,8 @@
   %0 = load <2 x double>, <2 x double>* @x, align 16
   %1 = load double, double* @y, align 8
   %vecins = insertelement <2 x double> %0, double %1, i32 0
-  store <2 x double> %vecins, <2 x double>* @z, align 16
+  ; Add volatile to keep the vector store from being split.
+  store volatile <2 x double> %vecins, <2 x double>* @z, align 16
   ret void
 }
 
@@ -105,7 +106,7 @@
   %0 = load <2 x double>, <2 x double>* @x, align 16
   %1 = load double, double* @y, align 8
   %vecins = insertelement <2 x double> %0, double %1, i32 1
-  store <2 x double> %vecins, <2 x double>* @z, align 16
+  store volatile <2 x double> %vecins, <2 x double>* @z, align 16
   ret void
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/vector-store-split.ll b/llvm/test/CodeGen/PowerPC/vector-store-split.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-store-split.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=LE,CHECK
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec < %s | FileCheck %s --check-prefixes=BE,CHECK
+
+; CHECK-LABEL: insert_store:
+; CHECK: stw 4, 12(3)
+; CHECK-NEXT: blr
+define void @insert_store(<4 x i32>* %q, i32 zeroext %s) {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %q, align 16
+  %vecins = insertelement <4 x i32> %0, i32 %s, i32 3
+  store <4 x i32> %vecins, <4 x i32>* %q, align 16
+  ret void
+}
+
+; CHECK-LABEL: volatile_check:
+; LE: vperm
+; BE: vsldoi
+; CHECK: stvx
+; CHECK: blr
+define void @volatile_check(<4 x i32>* %q, <4 x i32>* %p, i32 zeroext %s) {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %q, align 16
+  %vecins = insertelement <4 x i32> %0, i32 %s, i32 3
+  store volatile <4 x i32> %vecins, <4 x i32>* %q, align 16
+  %1 = load volatile <4 x i32>, <4 x i32>* %p, align 16
+  %vecins1 = insertelement <4 x i32> %1, i32 %s, i32 3
+  store <4 x i32> %vecins1, <4 x i32>* %p, align 16
+  ret void
+}
+
+; CHECK-LABEL: shuffle_store:
+; CHECK: li 4, 100
+; LE-NEXT: li 5, 300
+; LE-NEXT: stw 4, 12(3)
+; LE-NEXT: stw 5, 4(3)
+; BE-NEXT: stw 4, 12(3)
+; BE-NEXT: li 4, 300
+; BE-NEXT: stw 4, 4(3)
+; CHECK-NEXT: blr
+define void @shuffle_store(<4 x i32>* %q) {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %q, align 16
+  %vecins3 = shufflevector <4 x i32> %0, <4 x i32> <i32 100, i32 200, i32 300, i32 400>, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+  store <4 x i32> %vecins3, <4 x i32>* %q, align 16
+  ret void
+}
+
+; CHECK-LABEL: chain_shuffle:
+; CHECK: lwz 4, 0(4)
+; CHECK-NEXT: li 5, 2
+; CHECK-NEXT: stw 5, 4(3)
+; CHECK-NEXT: stw 4, 8(3)
+; CHECK-NEXT: blr
+define void @chain_shuffle(<4 x i32>* %q, i32* %m, i32 zeroext %s) {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %q, align 16
+  %1 = load i32, i32* %m, align 4
+  %vecins0 = insertelement <4 x i32> %0, i32 %s, i32 1
+  %vecins1 = shufflevector <4 x i32> %vecins0, <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  %vecins2 = insertelement <4 x i32> %vecins1, i32 %1, i32 2
+  store <4 x i32> %vecins2, <4 x i32>* %q, align 16
+  ret void
+}
+
+; CHECK-LABEL: exchange_elements:
+; LE: mtvsrd
+; LE: vperm
+; LE: xxswapd
+; LE: lvx
+; LE: vperm
+; LE: stvx
+; BE-COUNT-3: lvx
+; BE-COUNT-4: vsldoi
+; BE: stvx
+define void @exchange_elements(<4 x i32>* %a, <4 x i32>* %b) {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %a, align 16
+  %1 = load <4 x i32>, <4 x i32>* %b, align 16
+  %vecins0 = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %vecins1 = insertelement <4 x i32> %vecins0, i32 5, i32 0
+  store <4 x i32> %vecins1, <4 x i32>* %a, align 16
+  ret void
+}
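
Note for reviewers: the default TargetLowering::isCheapToSplitStore added above returns false, so targets that do not override it keep their current behavior. Below is a minimal sketch of how another target could opt in; the "Foo" target name and the half-the-vector-length threshold are hypothetical and only illustrate the intended use of the hook, they are not part of this patch.

  // Hypothetical override: allow the combine only when the number of scalar
  // stores stays at or below half of the vector's element count.
  bool FooTargetLowering::isCheapToSplitStore(StoreSDNode *N, unsigned NumSplit,
                                              SelectionDAG &DAG) const {
    EVT VecType = N->getValue().getValueType();
    if (!VecType.isVector() || NumSplit == 0)
      return false;
    return NumSplit <= VecType.getVectorNumElements() / 2;
  }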