diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -781,6 +781,15 @@
       Scalars[Mask[I]] = Prev[I];
 }
 
+/// Apply \p ReorderMask on \p Order.
+static void applyReorder(SmallVectorImpl<unsigned> &Order,
+                         ArrayRef<unsigned> ReorderMask) {
+  SmallVector<unsigned> OrigOrder(Order.begin(), Order.end());
+  assert(Order.size() == ReorderMask.size() && "Expected same size");
+  for (unsigned Idx : seq<unsigned>(0, Order.size()))
+    Order[Idx] = OrigOrder[ReorderMask[Idx]];
+}
+
 /// Checks if the provided value does not require scheduling. It does not
 /// require scheduling if this is not an instruction or it is an instruction
 /// that does not read/write memory and all operands are either not instructions
@@ -4011,6 +4020,17 @@
       transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
         return I < E ? static_cast<int>(I) : UndefMaskElem;
       });
+      // If the UserTE already has a ReorderIndices mask, we need to combine
+      // the user's mask with the current masks.
+      if (!Data.first->UserTreeIndices.empty()) {
+        assert(Data.first->UserTreeIndices.size() == 1 &&
+               "Expected exactly one user");
+        const EdgeInfo &EI = *Data.first->UserTreeIndices.begin();
+        if (!EI.UserTE->ReorderIndices.empty()) {
+          applyReorder(MaskOrder, EI.UserTE->ReorderIndices);
+          applyReorder(Mask, EI.UserTE->ReorderIndices);
+        }
+      }
       for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
         TreeEntry *TE = Op.second;
         OrderedEntries.remove(TE);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-grtev4-linux-gnu -S | FileCheck %s
+
+; This checks that reorderBottomToTop() can handle the reordering of a
+; TreeEntry whose user TreeEntry has already been reordered.
+; Here is how the crash occurs:
+;
+;                         (N4)OrderB
+;                          |
+;  (N1)OrderA  (N2)OrderA (N3)NoOrder
+;        \         |       /
+;          (Phi)NoOrder
+;
+; 1. Phi is visited along with its operands (N1,N2,N3). BestOrder is "OrderA".
+; 2. Phi along with all its operands (N1,N2,N3) are reordered. The result is:
+;
+;                         (N4)OrderB
+;                          |
+;  (N1)NoOrder (N2)NoOrder (N3)OrderA
+;        \         |       /
+;          (Phi)OrderA
+;
+; 3. N3 is now visited along with its operand N4. BestOrder is "OrderB".
+; 4. N3 and N4 are reordered. The result is this:
+;
+;                         (N4)NoOrder
+;                          |
+;  (N1)NoOrder (N2)NoOrder (N3)OrderB
+;        \         |       /
+;          (Phi)OrderA
+;
+; At this point there is a discrepancy between Phi's operand 2, which was
+; reordered based on OrderA, and N3's OrderB. This results in a crash in
+; vectorizeTree() on its way from N3 back to the Phi: N3->isSame(Phi's
+; operand 2) returns false, so vectorizeTree() skips N3.
+;
+; This patch fixes N3's order by setting it to the order that results from
+; combining both OrderB and OrderA.
+;
+; NOTE: The crash shows up when reorderTopToBottom() does not reorder the
+; tree, so to simulate this we add external store users. Alternatively, one
+; can comment out reorderTopToBottom() and remove the stores.
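+;
+; For illustration: applyReorder(Order, Mask) from the patch rewrites Order
+; in place to Order[i] = OrigOrder[Mask[i]], i.e. it permutes Order by Mask.
+; E.g. permuting Order = {1, 0, 2, 3} by Mask = {2, 3, 0, 1} gives
+; {2, 3, 1, 0}: element i of the result is the element of the original Order
+; found at position Mask[i].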
+
+
+define void @reorder_crash(float* %ptr) {
+; CHECK-LABEL: @reorder_crash(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0
+; CHECK-NEXT:    br i1 undef, label [[BB0:%.*]], label [[BB12:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb12:
+; CHECK-NEXT:    br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[SHUFFLE]], zeroinitializer
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x float> [ [[TMP1]], [[BB0]] ], [ [[TMP4]], [[BB1]] ], [ [[SHUFFLE1]], [[BB2]] ]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %gep0 = getelementptr inbounds float, float* %ptr, i64 0
+  %gep1 = getelementptr inbounds float, float* %ptr, i64 1
+  %gep2 = getelementptr inbounds float, float* %ptr, i64 2
+  %gep3 = getelementptr inbounds float, float* %ptr, i64 3
+  br i1 undef, label %bb0, label %bb12
+
+bb0:
+  ; Used by phi in this order: 1, 0, 2, 3
+  %ld00 = load float, float* %gep0
+  %ld01 = load float, float* %gep1
+  %ld02 = load float, float* %gep2
+  %ld03 = load float, float* %gep3
+
+  ; External store users in natural order 0, 1, 2, 3
+  store float %ld00, float* %gep0
+  store float %ld01, float* %gep1
+  store float %ld02, float* %gep2
+  store float %ld03, float* %gep3
+  br label %bb3
+
+bb12:
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  ; Used by phi in this order: 1, 0, 2, 3
+  %ld10 = load float, float* %gep0
+  %ld11 = load float, float* %gep1
+  %ld12 = load float, float* %gep2
+  %ld13 = load float, float* %gep3
+
+  ; External store users in natural order 0, 1, 2, 3
+  store float %ld10, float* %gep0
+  store float %ld11, float* %gep1
+  store float %ld12, float* %gep2
+  store float %ld13, float* %gep3
+
+  br label %bb3
+
+bb2:
+  ; Used by fadd in this order: 2, 3, 0, 1
+  %ld20 = load float, float* %gep0
+  %ld21 = load float, float* %gep1
+  %ld22 = load float, float* %gep2
+  %ld23 = load float, float* %gep3
+
+  ; Used by phi in this order: 0, 1, 2, 3
+  %add20 = fadd float %ld22, 0.0
+  %add21 = fadd float %ld23, 0.0
+  %add22 = fadd float %ld20, 0.0
+  %add23 = fadd float %ld21, 0.0
+  br label %bb3
+
+bb3:
+  %phi0 = phi float [ %ld01, %bb0 ], [ %ld11, %bb1 ], [ %add20, %bb2 ]
+  %phi1 = phi float [ %ld00, %bb0 ], [ %ld10, %bb1 ], [ %add21, %bb2 ]
+  %phi2 = phi float [ %ld02, %bb0 ], [ %ld12, %bb1 ], [ %add22, %bb2 ]
+  %phi3 = phi float [ %ld03, %bb0 ], [ %ld13, %bb1 ], [ %add23, %bb2 ]
+  ret void
+}
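
For reference, a minimal standalone sketch of the permutation composition that the applyReorder() helper performs, with std::vector standing in for llvm::SmallVector and a main() added so it builds outside the LLVM tree. The concrete values are the OrderA/OrderB orders from the test comment; which role each order plays inside reorderBottomToTop() depends on the traversal, so this only illustrates the helper's semantics:

#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors the applyReorder() helper added by the patch: rewrite Order in
// place so that Order[Idx] becomes OrigOrder[ReorderMask[Idx]], i.e. the
// existing order is permuted by ReorderMask.
static void applyReorder(std::vector<unsigned> &Order,
                         const std::vector<unsigned> &ReorderMask) {
  std::vector<unsigned> OrigOrder(Order);
  assert(Order.size() == ReorderMask.size() && "Expected same size");
  for (size_t Idx = 0; Idx != Order.size(); ++Idx)
    Order[Idx] = OrigOrder[ReorderMask[Idx]];
}

int main() {
  std::vector<unsigned> Order = {1, 0, 2, 3};             // OrderA
  const std::vector<unsigned> ReorderMask = {2, 3, 0, 1}; // OrderB
  applyReorder(Order, ReorderMask);
  for (unsigned V : Order)
    std::printf("%u ", V); // prints: 2 3 1 0
  std::printf("\n");
  return 0;
}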