diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4319,8 +4319,19 @@
       continue;
     OrderedScalars.push_back(Inst);
   }
-  llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) {
-    return DT->dominates(B, A);
+  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
+    auto *NodeA = DT->getNode(A->getParent());
+    auto *NodeB = DT->getNode(B->getParent());
+    assert(NodeA && "Should only process reachable instructions");
+    assert(NodeB && "Should only process reachable instructions");
+    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+           "Different nodes should have different DFS numbers");
+    // Sort bottom-up: an instruction in a dominated block (larger DFS-in
+    // number) comes before one in a dominating block, and within a single
+    // block the later instruction comes first.
+    if (NodeA != NodeB)
+      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
+    return B->comesBefore(A);
   });
 
   for (Instruction *Inst : OrderedScalars) {
@@ -5711,10 +5722,11 @@
 
   // Sort blocks by domination. This ensures we visit a block after all blocks
   // dominating it are visited.
-  llvm::stable_sort(CSEWorkList,
-                    [this](const DomTreeNode *A, const DomTreeNode *B) {
-                      return DT->properlyDominates(A, B);
-                    });
+  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
+    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
+           "Different nodes should have different DFS numbers");
+    return A->getDFSNumIn() < B->getDFSNumIn();
+  });
 
   // Perform O(N^2) search over the gather sequences and merge identical
   // instructions. TODO: We can further optimize this scan if we split the
@@ -6614,6 +6626,9 @@
   // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
   // delete instructions.
 
+  // Update DFS numbers now so that we can use them for ordering.
+  DT->updateDFSNumbers();
+
   // Scan the blocks in the function in post order.
   for (auto BB : post_order(&F.getEntryBlock())) {
     collectSeedInstructions(BB);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=i386-linux-gnu -slp-vectorizer -S %s | FileCheck %s
+
+%struct.a = type { [2 x i64] }
+
+@a = external global %struct.a
+@b = external global %struct.a
+@c = external global %struct.a
+
+define void @f(i1 %x) #0 {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @a to <2 x i64>*), align 8
+; CHECK-NEXT:    br i1 [[X:%.*]], label [[WHILE_BODY_LR_PH:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       while.body.lr.ph:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[ICMP_A1:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @b to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i1> [[TMP3]], i1 [[ICMP_A1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i64> [[TMP2]], <2 x i64> [[TMP0]]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i64> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5]], [[WHILE_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @c to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[ICMP_D0:%.*]] = icmp eq i64 [[TMP8]], 0
+; CHECK-NEXT:    br i1 [[ICMP_D0]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[AND0_TMP:%.*]] = and i64 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[AND0_TMP]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = and <2 x i64> [[TMP11]], [[TMP7]]
+; CHECK-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (%struct.a* @a to <2 x i64>*), align 8
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a0 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @a, i32 0, i32 0, i32 0), align 8
+  %a1 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @a, i32 0, i32 0, i32 1), align 8
+  br i1 %x, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph:
+  %icmp.a1 = icmp eq i64 %a1, 0
+  %b0 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @b, i32 0, i32 0, i32 0), align 8
+  %b1 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @b, i32 0, i32 0, i32 1), align 8
+  %c0 = select i1 %icmp.a1, i64 %b0, i64 %a0
+  %c1 = select i1 %icmp.a1, i64 %b1, i64 %a1
+  br label %while.end
+
+while.end:
+  %d0 = phi i64 [ %a0, %entry ], [ %c0, %while.body.lr.ph ]
+  %d1 = phi i64 [ %a1, %entry ], [ %c1, %while.body.lr.ph ]
+  %e0 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @c, i32 0, i32 0, i32 0), align 8
+  %e1 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @c, i32 0, i32 0, i32 1), align 8
+  %icmp.d0 = icmp eq i64 %d0, 0
+  br i1 %icmp.d0, label %if.end, label %if.then
+
+if.then:
+  %and0.tmp = and i64 %d0, 8
+  %and0 = and i64 %and0.tmp, %e0
+  %and1 = and i64 %e1, %d1
+  store i64 %and0, i64* getelementptr inbounds (%struct.a, %struct.a* @a, i32 0, i32 0, i32 0), align 8
+  store i64 %and1, i64* getelementptr inbounds (%struct.a, %struct.a* @a, i32 0, i32 0, i32 1), align 8
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+attributes #0 = { "target-features"="+sse2" }