diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5922,10 +5922,17 @@ const unsigned VF = 0; bool IsFinalized = false; SmallVector Mask; + /// Holds all of the instructions that we gathered. + SetVector &GatherShuffleSeq; + /// A list of blocks that we are going to CSE. + SetVector &CSEBlocks; public: - ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF) - : Builder(Builder), VF(VF) {} + ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF, + SetVector &GatherShuffleSeq, + SetVector &CSEBlocks) + : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq), + CSEBlocks(CSEBlocks) {} /// Adds a mask, inverting it before applying. void addInversedMask(ArrayRef SubMask) { @@ -5955,7 +5962,12 @@ if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) return V; - return Builder.CreateShuffleVector(V, Mask, "shuffle"); + Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle"); + if (auto *I = dyn_cast(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; } ~ShuffleInstructionBuilder() { @@ -6013,6 +6025,10 @@ std::iota(UniformMask.begin(), UniformMask.end(), 0); V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); } + if (auto *I = dyn_cast(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } return V; } @@ -6060,15 +6076,12 @@ VL = UniqueValues; } - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { ShuffleBuilder.addMask(ReuseShuffleIndicies); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast(Vec)) { - GatherShuffleSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } return Vec; } @@ -6083,7 +6096,8 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); unsigned VF = E->getVectorFactor(); - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); if (E->State == TreeEntry::NeedToGather) { if (E->getMainOp()) setInsertPointAfterBundle(E); @@ -6097,16 +6111,16 @@ "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, Entries.back()->VectorizedValue, Mask); + if (auto *I = dyn_cast(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } else { Vec = gather(E->Scalars); } if (NeedToShuffleReuses) { ShuffleBuilder.addMask(E->ReuseShuffleIndices); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast(Vec)) { - GatherShuffleSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } E->VectorizedValue = Vec; return Vec; @@ -6223,8 +6237,13 @@ IsIdentity &= *InsertIdx - Offset == I; Mask[*InsertIdx - Offset] = I; } - if (!IsIdentity || NumElts != NumScalars) + if (!IsIdentity || NumElts != NumScalars) { V = Builder.CreateShuffleVector(V, Mask); + if (auto *I = dyn_cast(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } if ((!IsIdentity || Offset != 0 || !isUndefVector(FirstInsert->getOperand(0))) && @@ -6239,6 +6258,10 @@ V = Builder.CreateShuffleVector( FirstInsert->getOperand(0), V, InsertMask, cast(E->Scalars.back())->getName()); + if (auto *I = dyn_cast(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } ++NumVectorInstructions; @@ -6621,8 +6644,11 @@ propagateIRFlags(V1, AltScalars); Value *V = Builder.CreateShuffleVector(V0, V1, Mask); - if (Instruction *I = dyn_cast(V)) + if (auto *I = dyn_cast(V)) { V = propagateMetadata(I, E->Scalars); + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -6863,7 +6889,50 @@ return A->getDFSNumIn() < B->getDFSNumIn(); }); - // Perform O(N^2) search over the gather sequences and merge identical + // Less defined shuffles can be replaced by the more defined copies. + // Between two shuffles one is less defined if it has the same vector operands + // and its mask indeces are the same as in the first one or undefs. E.g. + // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0, + // poison, <0, 0, 0, 0>. + auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, + SmallVectorImpl &NewMask) { + if (I1->getType() != I2->getType()) + return false; + auto *SI1 = dyn_cast(I1); + auto *SI2 = dyn_cast(I2); + if (!SI1 || !SI2) + return I1->isIdenticalTo(I2); + if (SI1->isIdenticalTo(SI2)) + return true; + for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) + if (SI1->getOperand(I) != SI2->getOperand(I)) + return false; + // Check if the second instruction is more defined than the first one. + NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); + ArrayRef SM1 = SI1->getShuffleMask(); + // Count trailing undefs in the mask to check the final number of used + // registers. + unsigned LastUndefsCnt = 0; + for (int I = 0, E = NewMask.size(); I < E; ++I) { + if (SM1[I] == UndefMaskElem) + ++LastUndefsCnt; + else + LastUndefsCnt = 0; + if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem && + NewMask[I] != SM1[I]) + return false; + if (NewMask[I] == UndefMaskElem) + NewMask[I] = SM1[I]; + } + // Check if the last undefs actually change the final number of used vector + // registers. + return SM1.size() - LastUndefsCnt > 1 && + TTI->getNumberOfParts(SI1->getType()) == + TTI->getNumberOfParts( + FixedVectorType::get(SI1->getType()->getElementType(), + SM1.size() - LastUndefsCnt)); + }; + // Perform O(N^2) search over the gather/shuffle sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector Visited; @@ -6883,11 +6952,29 @@ // Check if we can replace this instruction with any of the // visited instructions. bool Replaced = false; - for (Instruction *v : Visited) { - if (In.isIdenticalTo(v) && - DT->dominates(v->getParent(), In.getParent())) { - In.replaceAllUsesWith(v); + for (Instruction *&V : Visited) { + SmallVector NewMask; + if (IsIdenticalOrLessDefined(&In, V, NewMask) && + DT->dominates(V->getParent(), In.getParent())) { + In.replaceAllUsesWith(V); eraseInstruction(&In); + if (auto *SI = dyn_cast(V)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + Replaced = true; + break; + } + if (isa(In) && isa(V) && + GatherShuffleSeq.contains(V) && + IsIdenticalOrLessDefined(V, &In, NewMask) && + DT->dominates(In.getParent(), V->getParent())) { + In.moveAfter(V); + V->replaceAllUsesWith(&In); + eraseInstruction(V); + if (auto *SI = dyn_cast(&In)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + V = &In; Replaced = true; break; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll @@ -9,8 +9,7 @@ ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 @@ -39,9 +38,8 @@ ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 @@ -70,9 +68,8 @@ ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll @@ -7,15 +7,15 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> , i16 [[TMP0:%.*]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP5]], <4 x i32> ; CHECK-NEXT: br label [[FOR_BODY92:%.*]] ; CHECK: for.body92: ; CHECK-NEXT: [[SUM_MVR_I:%.*]] = getelementptr i32, i32* undef, i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> ; CHECK-NEXT: [[SUM_MVR_ABS_I:%.*]] = getelementptr i32, i32* undef, i32 2 ; CHECK-NEXT: [[SUM_MVC_I:%.*]] = getelementptr i32, i32* undef, i32 1 ; CHECK-NEXT: [[SUM_MVC_ABS_I:%.*]] = getelementptr i32, i32* undef, i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP5]], <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[SUM_MVR_I]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 8