diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9590,6 +9590,13 @@
         // process to keep correct order.
         return Delayed;
       }
+      // If any of the entries to be shuffled is itself recorded as a postponed
+      // gather, record this node in PostponedGathers as well.
+      if (any_of(Entries, [&](const TreeEntry *TE) {
+            return PostponedGathers.count(TE);
+          }))
+        PostponedGathers.insert(E);
+
       assert((Entries.size() == 1 || Entries.size() == 2) &&
              "Expected shuffle of 1 or 2 entries.");
       if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll b/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=slp-vectorizer -slp-threshold=-10 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s
+
+define void @"foo"() {
+; CHECK-LABEL: define void @foo() {
+; CHECK-NEXT:  bci_0:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(1) null, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[BCI_252:%.*]]
+; CHECK:       bci_252:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[BCI_0:%.*]] ], [ [[TMP16:%.*]], [[BCI_252_1:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <2 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i32> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    br i1 false, label [[NOT_ZERO70:%.*]], label [[BCI_252_1]]
+; CHECK:       bci_252.1:
+; CHECK-NEXT:    [[TMP10:%.*]] = or <2 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = mul <2 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i32> [[TMP2]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i32> [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16]] = or <2 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    br label [[BCI_252]]
+; CHECK:       not_zero70:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x i32> [ [[TMP9]], [[BCI_252]] ]
+; CHECK-NEXT:    ret void
+;
+bci_0:
+  %0 = load i32, ptr addrspace(1) null, align 8
+  br label %bci_252
+
+bci_252:
+  %1 = phi i32 [ 0, %bci_0 ], [ %20, %bci_252.1 ]
+  %2 = phi i32 [ 0, %bci_0 ], [ %15, %bci_252.1 ]
+  %3 = mul i32 %0, 0
+  %4 = or i32 %0, %3
+  %5 = or i32 %4, 0
+  %.neg91.neg = or i32 %2, 0
+  %.neg446 = or i32 %.neg91.neg, %5
+  %6 = or i32 %.neg446, 0
+  %7 = mul i32 0, 0
+  %8 = or i32 %0, %7
+  %9 = or i32 %8, 0
+  %.neg91.1.neg = or i32 %1, 0
+  %.neg448 = or i32 %.neg91.1.neg, %9
+  %10 = or i32 %.neg448, 0
+  br i1 false, label %not_zero70, label %bci_252.1
+
+bci_252.1:
+  %11 = or i32 %0, 0
+  %12 = mul i32 %11, 0
+  %13 = or i32 %0, %12
+  %14 = or i32 %13, 0
+  %.neg91.neg.1 = or i32 %6, 0
+  %.neg446.1 = or i32 %.neg91.neg.1, %14
+  %15 = or i32 %.neg446.1, 0
+  %16 = or i32 %0, 0
+  %17 = mul i32 %16, 0
+  %18 = or i32 %0, %17
+  %19 = or i32 %18, 0
+  %.neg91.1.neg.1 = or i32 %10, 0
+  %.neg448.1 = or i32 %.neg91.1.neg.1, %19
+  %20 = or i32 %.neg448.1, 0
+  br label %bci_252
+
+not_zero70:
+  %.lcssa546 = phi i32 [ %6, %bci_252 ]
+  %.lcssa545 = phi i32 [ %10, %bci_252 ]
+  ret void
+}
+
+