diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2899,7 +2899,8 @@ } if (Last->State != TreeEntry::NeedToGather) { for (Value *V : VL) { - assert(!getTreeEntry(V) && "Scalar already in tree!"); + if (getTreeEntry(V)) + continue; ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. @@ -5780,7 +5781,8 @@ // Check that none of the instructions in the bundle are already in the tree. for (Value *V : VL) { - if (!IsScatterVectorizeUserTE && !isa(V)) + if ((!IsScatterVectorizeUserTE && !isa(V)) || + doesNotNeedToBeScheduled(V)) continue; if (getTreeEntry(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V @@ -7525,6 +7527,12 @@ unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); const unsigned Sz = VL.size(); + SmallBitVector UsedScalars(Sz, false); + for (unsigned I = 0; I < Sz; ++I) { + if (getTreeEntry(VL[I]) == E) + continue; + UsedScalars.set(I); + } auto GetCostDiff = [=](function_ref ScalarEltCost, function_ref VectorCost) { @@ -7534,10 +7542,13 @@ // For some of the instructions no need to calculate cost for each // particular instruction, we can use the cost of the single // instruction x total number of scalar instructions. - ScalarCost = Sz * ScalarEltCost(0); + ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0); } else { - for (unsigned I = 0; I < Sz; ++I) + for (unsigned I = 0; I < Sz; ++I) { + if (UsedScalars.test(I)) + continue; ScalarCost += ScalarEltCost(I); + } } InstructionCost VecCost = VectorCost(CommonCost); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -6,23 +6,18 @@ ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <4 x ptr> [[SHUFFLE]], <4 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> , <4 x i64> poison) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP3]], i32 8, <4 x i1> , <4 x i64> poison) ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: [[ARG_1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 8 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[ARG]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <2 x ptr> [[SHUFFLE1]], <2 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> poison, ptr [[ARG]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP6]], <4 x ptr> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x ptr> [[TMP8]], ptr [[ARG_1]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP9]], i32 8, <4 x i1> , <4 x i64> poison) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x ptr> poison, ptr [[ARG]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <4 x ptr> [[TMP6]], <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP7]], i32 8, <4 x i1> , <4 x i64> poison) ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: -; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i64> [ [[TMP3]], [[IF]] ], [ [[TMP10]], [[ELSE]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i64> [ [[TMP4]], [[IF]] ], [ [[TMP8]], [[ELSE]] ] ; CHECK-NEXT: ret void ; br i1 %c, label %if, label %else