diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3797,6 +3797,51 @@ return None; } +/// Check if two insertelement instructions are from the same buildvector. +static bool areTwoInsertFromSameBuildVector( + InsertElementInst *VU, InsertElementInst *V, + function_ref GetBaseOperand) { + // Instructions must be from the same basic blocks. + if (VU->getParent() != V->getParent()) + return false; + // Checks if 2 insertelements are from the same buildvector. + if (VU->getType() != V->getType()) + return false; + // Multiple used inserts are separate nodes. + if (!VU->hasOneUse() && !V->hasOneUse()) + return false; + auto *IE1 = VU; + auto *IE2 = V; + Optional Idx1 = getInsertIndex(IE1); + Optional Idx2 = getInsertIndex(IE2); + if (Idx1 == None || Idx2 == None) + return false; + // Go through the vector operand of insertelement instructions trying to find + // either VU as the original vector for IE2 or V as the original vector for + // IE1. + do { + if (IE2 == VU) + return VU->hasOneUse(); + if (IE1 == V) + return V->hasOneUse(); + if (IE1) { + if ((IE1 != VU && !IE1->hasOneUse()) || + getInsertIndex(IE1).value_or(*Idx2) == *Idx2) + IE1 = nullptr; + else + IE1 = dyn_cast_or_null(GetBaseOperand(IE1)); + } + if (IE2) { + if ((IE2 != V && !IE2->hasOneUse()) || + getInsertIndex(IE2).value_or(*Idx1) == *Idx1) + IE2 = nullptr; + else + IE2 = dyn_cast_or_null(GetBaseOperand(IE2)); + } + } while (IE1 || IE2); + return false; +} + Optional BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // No need to reorder if need to shuffle reuses, still need to shuffle the @@ -3860,6 +3905,58 @@ (TopToBottom && isa(TE.getMainOp()))) && !TE.isAltShuffle()) return TE.ReorderIndices; + if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { + auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) { + if (!V1->hasOneUse() || !V2->hasOneUse()) + return false; + auto *FirstUserOfPhi1 = cast(*V1->user_begin()); + auto *FirstUserOfPhi2 = cast(*V2->user_begin()); + if (auto *IE1 = dyn_cast(FirstUserOfPhi1)) + if (auto *IE2 = dyn_cast(FirstUserOfPhi2)) { + if (!areTwoInsertFromSameBuildVector( + IE1, IE2, + [](InsertElementInst *II) { return II->getOperand(0); })) + return false; + Optional Idx1 = getInsertIndex(IE1); + Optional Idx2 = getInsertIndex(IE2); + if (Idx1 == None || Idx2 == None) + return false; + return *Idx1 < *Idx2; + } + if (auto *EE1 = dyn_cast(FirstUserOfPhi1)) + if (auto *EE2 = dyn_cast(FirstUserOfPhi2)) { + if (EE1->getOperand(0) != EE2->getOperand(0)) + return false; + Optional Idx1 = getExtractIndex(EE1); + Optional Idx2 = getExtractIndex(EE2); + if (Idx1 == None || Idx2 == None) + return false; + return *Idx1 < *Idx2; + } + return false; + }; + auto IsIdentityOrder = [](const OrdersType &Order) { + for (unsigned Idx : seq(0, Order.size())) + if (Idx != Order[Idx]) + return false; + return true; + }; + if (!TE.ReorderIndices.empty()) + return TE.ReorderIndices; + DenseMap PhiToId; + SmallVector Phis; + OrdersType ResOrder(TE.Scalars.size()); + for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) { + PhiToId[TE.Scalars[Id]] = Id; + Phis.push_back(TE.Scalars[Id]); + } + llvm::stable_sort(Phis, PHICompare); + for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id) + ResOrder[Id] = PhiToId[Phis[Id]]; + if (IsIdentityOrder(ResOrder)) + return {}; + return ResOrder; + } if (TE.State == TreeEntry::NeedToGather) { // TODO: add analysis of other gather nodes with extractelement // instructions and other values/instructions, not only undefs. @@ -3937,6 +4034,9 @@ // their ordering. DenseMap GathersToOrders; + // Phi nodes can have preferred ordering based on their result users + DenseMap PhisToOrders; + // AltShuffles can also have a preferred ordering that leads to fewer // instructions, e.g., the addsub instruction in x86. DenseMap AltShufflesToOrders; @@ -3951,7 +4051,7 @@ // extracts. for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries, &GathersToOrders, &ExternalUserReorderMap, - &AltShufflesToOrders]( + &AltShufflesToOrders, &PhisToOrders]( const std::unique_ptr &TE) { // Look for external users that will probably be vectorized. SmallVector ExternalUserReorderIndices = @@ -4008,6 +4108,9 @@ VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); + if (TE->State == TreeEntry::Vectorize && + TE->getOpcode() == Instruction::PHI) + PhisToOrders.try_emplace(TE.get(), *CurrentOrder); } }); @@ -4033,8 +4136,8 @@ if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE)) continue; // Count number of orders uses. - const auto &Order = [OpTE, &GathersToOrders, - &AltShufflesToOrders]() -> const OrdersType & { + const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders, + &PhisToOrders]() -> const OrdersType & { if (OpTE->State == TreeEntry::NeedToGather || !OpTE->ReuseShuffleIndices.empty()) { auto It = GathersToOrders.find(OpTE); @@ -4046,6 +4149,12 @@ if (It != AltShufflesToOrders.end()) return It->second; } + if (OpTE->State == TreeEntry::Vectorize && + OpTE->getOpcode() == Instruction::PHI) { + auto It = PhisToOrders.find(OpTE); + if (It != PhisToOrders.end()) + return It->second; + } return OpTE->ReorderIndices; }(); // First consider the order of the external scalar users. @@ -7140,51 +7249,6 @@ return Cost; } -/// Check if two insertelement instructions are from the same buildvector. -static bool areTwoInsertFromSameBuildVector( - InsertElementInst *VU, InsertElementInst *V, - function_ref GetBaseOperand) { - // Instructions must be from the same basic blocks. - if (VU->getParent() != V->getParent()) - return false; - // Checks if 2 insertelements are from the same buildvector. - if (VU->getType() != V->getType()) - return false; - // Multiple used inserts are separate nodes. - if (!VU->hasOneUse() && !V->hasOneUse()) - return false; - auto *IE1 = VU; - auto *IE2 = V; - Optional Idx1 = getInsertIndex(IE1); - Optional Idx2 = getInsertIndex(IE2); - if (Idx1 == None || Idx2 == None) - return false; - // Go through the vector operand of insertelement instructions trying to find - // either VU as the original vector for IE2 or V as the original vector for - // IE1. - do { - if (IE2 == VU) - return VU->hasOneUse(); - if (IE1 == V) - return V->hasOneUse(); - if (IE1) { - if ((IE1 != VU && !IE1->hasOneUse()) || - getInsertIndex(IE1).value_or(*Idx2) == *Idx2) - IE1 = nullptr; - else - IE1 = dyn_cast_or_null(GetBaseOperand(IE1)); - } - if (IE2) { - if ((IE2 != V && !IE2->hasOneUse()) || - getInsertIndex(IE2).value_or(*Idx1) == *Idx1) - IE2 = nullptr; - else - IE2 = dyn_cast_or_null(GetBaseOperand(IE2)); - } - } while (IE1 || IE2); - return false; -} - /// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the /// buildvector sequence. static bool isFirstInsertElement(const InsertElementInst *IE1, diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll @@ -63,8 +63,8 @@ ; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x half> [[IN1]], i64 1 ; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x half> [[IN1]], i64 2 ; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x half> poison, half [[A1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x half> [[TMP0]], half [[A0]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x half> poison, half [[A0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x half> [[TMP0]], half [[A1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[A2]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[A3]], i32 1 ; CHECK-NEXT: br i1 [[CMP:%.*]], label [[BB1:%.*]], label [[BB0:%.*]] @@ -73,15 +73,15 @@ ; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x half> [[IN2]], i64 1 ; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x half> [[IN2]], i64 2 ; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x half> poison, half [[B1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x half> [[TMP4]], half [[B0]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x half> poison, half [[B0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x half> [[TMP4]], half [[B1]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x half> poison, half [[B2]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x half> [[TMP6]], half [[B3]], i32 1 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x half> [ [[TMP1]], %entry ], [ [[TMP5]], %bb0 ] ; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x half> [ [[TMP3]], %entry ], [ [[TMP7]], %bb0 ] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x half> [[TMP8]], <2 x half> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x half> [[TMP8]], <2 x half> poison, <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x half> [[TMP9]], <2 x half> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x i32> ; CHECK-NEXT: ret <4 x half> [[TMP12]]