diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8674,16 +8674,103 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { bool Changed = false; + // Sort by type, base pointers and values operand. Value operands must be + // compatible (have the same opcode, same parent), otherwise it is + // definitely not profitable to try to vectorize them. + auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) { + if (V->getPointerOperandType()->getTypeID() < + V2->getPointerOperandType()->getTypeID()) + return true; + if (V->getPointerOperandType()->getTypeID() > + V2->getPointerOperandType()->getTypeID()) + return false; + // UndefValues are compatible with all other values. + if (isa(V->getValueOperand()) || + isa(V2->getValueOperand())) + return false; + if (auto *I1 = dyn_cast(V->getValueOperand())) + if (auto *I2 = dyn_cast(V2->getValueOperand())) { + DomTreeNodeBase *NodeI1 = + DT->getNode(I1->getParent()); + DomTreeNodeBase *NodeI2 = + DT->getNode(I2->getParent()); + assert(NodeI1 && "Should only process reachable instructions"); + assert(NodeI1 && "Should only process reachable instructions"); + assert((NodeI1 == NodeI2) == + (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeI1 != NodeI2) + return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); + InstructionsState S = getSameOpcode({I1, I2}); + if (S.getOpcode()) + return false; + return I1->getOpcode() < I2->getOpcode(); + } + if (isa(V->getValueOperand()) && + isa(V2->getValueOperand())) + return false; + return V->getValueOperand()->getValueID() < + V2->getValueOperand()->getValueID(); + }; + + auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) { + if (V1 == V2) + return true; + if (V1->getPointerOperandType() != V2->getPointerOperandType()) + return false; + // Undefs are compatible with any other value. + if (isa(V1->getValueOperand()) || + isa(V2->getValueOperand())) + return true; + if (auto *I1 = dyn_cast(V1->getValueOperand())) + if (auto *I2 = dyn_cast(V2->getValueOperand())) { + if (I1->getParent() != I2->getParent()) + return false; + InstructionsState S = getSameOpcode({I1, I2}); + return S.getOpcode() > 0; + } + if (isa(V1->getValueOperand()) && + isa(V2->getValueOperand())) + return true; + return V1->getValueOperand()->getValueID() == + V2->getValueOperand()->getValueID(); + }; + // Attempt to sort and vectorize each of the store-groups. - for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e; - ++it) { - if (it->second.size() < 2) + for (auto &Pair : Stores) { + if (Pair.second.size() < 2) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " - << it->second.size() << ".\n"); + << Pair.second.size() << ".\n"); + + stable_sort(Pair.second, StoreSorter); + + // Try to vectorize elements based on their compatibility. + for (ArrayRef::iterator IncIt = Pair.second.begin(), + E = Pair.second.end(); + IncIt != E;) { + + // Look for the next elements with the same type. + ArrayRef::iterator SameTypeIt = IncIt; + Type *EltTy = (*IncIt)->getPointerOperand()->getType(); - Changed |= vectorizeStores(it->second, R); + while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt)) + ++SameTypeIt; + + // Try to vectorize them. + unsigned NumElts = (SameTypeIt - IncIt); + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores (" + << NumElts << ")\n"); + if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() && + vectorizeStores(makeArrayRef(IncIt, NumElts), R)) { + // Success start over because instructions might have been changed. + Changed = true; + } + + // Start over at the next instruction of a different type (or the end). + IncIt = SameTypeIt; + } } return Changed; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll @@ -19,13 +19,16 @@ ; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 ; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]] -; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_4]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_8]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_8]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[TMP7]], [[TMP9]] ; CHECK-NEXT: br label [[BLOCK1:%.*]] ; CHECK: block1: ; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 5 @@ -37,11 +40,11 @@ ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 4 -; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_10]], align 4 ; CHECK-NEXT: store i32 [[LOAD_9]], i32* [[GEP_9]], align 4 -; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_11]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[GEP_10]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0