diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7232,23 +7232,58 @@
 
 void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
   // Get the basic block this bundle is in. All instructions in the bundle
-  // should be in this block.
+  // should be in this block (except for extractelement-like instructions with
+  // constant indices).
   auto *Front = E->getMainOp();
   auto *BB = Front->getParent();
   assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
     auto *I = cast<Instruction>(V);
-    return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
+    return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
+           isVectorLikeInstWithConstOps(I);
   }));
 
-  auto &&FindLastInst = [E, Front]() {
+  // Find the "last" scalar of the bundle: within one block this is program
+  // order; across blocks it is the dominator tree DFS-in order. BB (captured
+  // by reference) is updated to the parent block of the result.
+  auto &&FindLastInst = [E, Front, this, &BB]() {
     Instruction *LastInst = Front;
     for (Value *V : E->Scalars) {
       auto *I = dyn_cast<Instruction>(V);
       if (!I)
         continue;
-      if (LastInst->comesBefore(I))
+      // Same block: the order of the instructions in the block decides.
+      if (LastInst->getParent() == I->getParent()) {
+        if (LastInst->comesBefore(I))
+          LastInst = I;
+        continue;
+      }
+      // Scalars from different blocks are expected to be vector-like
+      // instructions with constant operands only.
+      assert(isVectorLikeInstWithConstOps(LastInst) &&
+             isVectorLikeInstWithConstOps(I) &&
+             "Expected vector-like insts only.");
+      // A candidate in a block unreachable from entry is replaced by I; an I
+      // in an unreachable block is skipped.
+      if (!DT->isReachableFromEntry(LastInst->getParent())) {
+        LastInst = I;
+        continue;
+      }
+      if (!DT->isReachableFromEntry(I->getParent()))
+        continue;
+      // Both blocks are reachable: the instruction whose block has the larger
+      // DFS-in number in the dominator tree is treated as the later one.
+      auto *NodeA = DT->getNode(LastInst->getParent());
+      auto *NodeB = DT->getNode(I->getParent());
+      assert(NodeA && "Should only process reachable instructions");
+      assert(NodeB && "Should only process reachable instructions");
+      assert((NodeA == NodeB) ==
+                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+             "Different nodes should have different DFS numbers");
+      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
         LastInst = I;
     }
+    // Publish the block of the chosen instruction through BB.
+    BB = LastInst->getParent();
     return LastInst;
   };
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -slp-vectorizer -mtriple=x86_64-unknown-linux -mattr="-avx512pf,+avx512f,+avx512bw" -slp-threshold=-100 -slp-min-tree-size=0 < %s | FileCheck %s + +define i32 @foo(i32 %a) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[A:%.*]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 +; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: [[SHUFFLE13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHUFFLE13]]) +; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[TMP6]], 0 +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX14]], [[BB1]] ], [ 0, [[BB2:%.*]] ] +; CHECK-NEXT: ret i32 0 +; CHECK: bb4: +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[SHUFFLE9:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE9]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer) +; 
CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[TMP9]], 0 +; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[OP_RDX10]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[TMP10]], [[OP_RDX11]] +; CHECK-NEXT: ret i32 [[OP_RDX12]] +; CHECK: bb5: +; CHECK-NEXT: br label [[BB4:%.*]] +; +entry: + %0 = sub nsw i32 0, %a + %local = sub nsw i32 0, 0 + br i1 false, label %bb5, label %bb1 + +bb1: + %1 = add i32 %0, %local + %2 = add i32 %1, 0 + %3 = add i32 %2, %local + %4 = add i32 %3, 0 + %5 = add i32 %4, %local + br label %bb3 + +bb2: + br label %bb3 + +bb3: + %p1 = phi i32 [ %5, %bb1 ], [ 0, %bb2 ] + ret i32 0 + +bb4: + %6 = add i32 %0, %local + %7 = add i32 %6, %local + %8 = add i32 %7, 0 + %9 = add i32 %8, %local + %10 = add i32 %9, 0 + %11 = add i32 %10, %local + %12 = add i32 %11, 0 + %13 = add i32 %12, %local + %14 = add i32 %13, 0 + %15 = add i32 %14, %local + %16 = add i32 %15, 0 + %17 = add i32 %16, %local + %18 = add i32 %17, 0 + %19 = add i32 %18, %local + ret i32 %19 + +bb5: + br label %bb4 +}