diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8363,7 +8363,8 @@ } } } else { - Builder.SetInsertPoint(cast<Instruction>(User)); + Builder.SetInsertPoint(VecI->getParent(), + std::next(VecI->getIterator())); Value *NewInst = ExtractAndExtendIfNeeded(Vec); CSEBlocks.insert(cast<Instruction>(User)->getParent()); User->replaceUsesOfWith(Scalar, NewInst); diff --git a/llvm/test/Transforms/SLPVectorizer/pr55796.ll b/llvm/test/Transforms/SLPVectorizer/pr55796.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/pr55796.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -S < %s | FileCheck %s +; target triple = "x86_64-unknown-linux-gnu" + +; REQUIRES: asserts + +define i32 @"foo"(i8 addrspace(3)* %tmp_buf) #0 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: always_continue: +; CHECK-NEXT: [[LOCAL_TMP_6:%.*]] = load i32, i32 addrspace(3)* undef, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_TMP_6]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[LOCAL_TMP_6]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> , [[TMP1]] +; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE10]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[SHUFFLE10]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1 +; CHECK-NEXT: [[SHUFFLE14:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: br i1 undef, label [[GUARDED209_7:%.*]], label [[DEOPT210_SPLIT_LOOP_EXIT1140:%.*]] +; CHECK: deopt210.split.loop.exit1140: +; 
CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[SHUFFLE14]]) +; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br label [[DEOPT210:%.*]] +; CHECK: deopt210.split.loop.exit1164: +; CHECK-NEXT: br label [[DEOPT210]] +; CHECK: deopt210: +; CHECK-NEXT: [[LOCAL_7_3370_LCSSA1006:%.*]] = phi i32 [ [[OP_RDX15]], [[DEOPT210_SPLIT_LOOP_EXIT1140]] ], [ undef, [[DEOPT210_SPLIT_LOOP_EXIT1164:%.*]] ] +; CHECK-NEXT: ret i32 undef +; CHECK: deopt215.split.loop.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[TMP9]], undef +; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX11]], [[TMP3]] +; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[TMP12]], [[OP_RDX12]] +; CHECK-NEXT: [[DEOPTCALL216:%.*]] = call i32 (...) 
@llvm.experimental.deoptimize.i32(i32 12) [ "deopt"(i32 0, i32 1, i32 0, i32 142, i32 4, i32 19, i32 1, i32 0, i8 addrspace(1)* undef, i32 3, i32 undef, i32 0, i8 addrspace(1)* undef, i32 3, i32 undef, i32 4, i64 undef, i32 7, i8* null, i32 4, i64 undef, i32 7, i8* null, i32 4, double undef, i32 7, i8* null, i32 3, i32 [[LOCAL_TMP_6]], i32 3, i32 [[OP_RDX13]], i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 7, i8* null, i32 3, i32 undef, i32 3, i32 undef, i32 3, float undef, i32 3, i32 undef, i32 0, i8 addrspace(1)* undef, i32 7, i8* null, i32 7, i8* null, i32 0, i8 addrspace(1)* undef) ] +; CHECK-NEXT: ret i32 [[DEOPTCALL216]] +; CHECK: guarded209.7: +; CHECK-NEXT: br label [[DEOPT215_SPLIT_LOOP_EXIT:%.*]] +; +always_continue: + %local.tmp.6 = load i32, i32 addrspace(3)* undef, align 4 + %0 = sub nsw i32 110, %local.tmp.6 + %local_6_.lcssa.neg = sub nsw i32 0, %local.tmp.6 + br i1 undef, label %guarded209.7, label %deopt210.split.loop.exit1140 + +deopt210.split.loop.exit1140: ; preds = %always_continue + %1 = add i32 0, %0 + %2 = add i32 %1, %local_6_.lcssa.neg + %3 = add i32 %2, 90 + %4 = add i32 %3, %local_6_.lcssa.neg + %5 = add i32 %4, 72 + %6 = add i32 %5, %local_6_.lcssa.neg + %7 = add i32 %6, 56 + %8 = add i32 %7, %local_6_.lcssa.neg + %9 = add i32 %8, 42 + %10 = add i32 %9, %local_6_.lcssa.neg + %11 = add i32 %10, 30 + %12 = add i32 %11, %local_6_.lcssa.neg + %13 = add i32 %12, 20 + %14 = add i32 %13, %local_6_.lcssa.neg + %15 = add i32 %14, 12 + br label %deopt210 + +deopt210.split.loop.exit1164: ; No predecessors! 
+ br label %deopt210 + +deopt210: ; preds = %deopt210.split.loop.exit1164, %deopt210.split.loop.exit1140 + %local_7_3370.lcssa1006 = phi i32 [ %15, %deopt210.split.loop.exit1140 ], [ undef, %deopt210.split.loop.exit1164 ] + ret i32 undef + +deopt215.split.loop.exit: ; preds = %guarded209.7 + %16 = add i32 undef, %0 + %17 = add i32 %16, %local_6_.lcssa.neg + %18 = add i32 %17, 90 + %19 = add i32 %18, %local_6_.lcssa.neg + %20 = add i32 %19, 72 + %21 = add i32 %20, %local_6_.lcssa.neg + %22 = add i32 %21, 56 + %23 = add i32 %22, %local_6_.lcssa.neg + %24 = add i32 %23, 42 + %25 = add i32 %24, %local_6_.lcssa.neg + %26 = add i32 %25, 30 + %27 = add i32 %26, %local_6_.lcssa.neg + %28 = add i32 %27, 20 + %29 = add i32 %28, %local_6_.lcssa.neg + %30 = add i32 %29, 12 + %31 = add i32 %30, %local_6_.lcssa.neg + %32 = add i32 %31, 6 + %deoptcall216 = call i32 (...) @llvm.experimental.deoptimize.i32(i32 12) [ "deopt"(i32 0, i32 1, i32 0, i32 142, i32 4, i32 19, i32 1, i32 0, i8 addrspace(1)* undef, i32 3, i32 undef, i32 0, i8 addrspace(1)* undef, i32 3, i32 undef, i32 4, i64 undef, i32 7, i8* null, i32 4, i64 undef, i32 7, i8* null, i32 4, double undef, i32 7, i8* null, i32 3, i32 %local.tmp.6, i32 3, i32 %32, i32 3, i32 undef, i32 3, i32 undef, i32 3, i32 undef, i32 7, i8* null, i32 3, i32 undef, i32 3, i32 undef, i32 3, float undef, i32 3, i32 undef, i32 0, i8 addrspace(1)* undef, i32 7, i8* null, i32 7, i8* null, i32 0, i8 addrspace(1)* undef) ] + ret i32 %deoptcall216 + +guarded209.7: ; preds = %always_continue + br label %deopt215.split.loop.exit +} + + +declare i32 @llvm.experimental.deoptimize.i32(...) + +attributes #0 = { "target-features"="-avx512pf,+avx512f,+avx512bw" } +attributes #1 = { willreturn "target-features"="-avx512pf,+avx512f,+avx512bw" }