diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1680,6 +1680,28 @@
       return IsSame(Scalars, ReuseShuffleIndices);
     }
 
+    /// \returns true if current entry has same operands as \p TE.
+    bool hasEqualOperands(const TreeEntry &TE) const {
+      if (TE.getNumOperands() != getNumOperands())
+        return false;
+      SmallBitVector Used(getNumOperands());
+      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
+        unsigned PrevCount = Used.count();
+        for (unsigned K = 0; K < E; ++K) {
+          if (Used.test(K))
+            continue;
+          if (getOperand(K) == TE.getOperand(I)) {
+            Used.set(K);
+            break;
+          }
+        }
+        // Check if we actually found the matching operand.
+        if (PrevCount == Used.count())
+          return false;
+      }
+      return true;
+    }
+
     /// \return Final vectorization factor for the node. Defined by the total
     /// number of vectorized scalars, including those, used several times in the
     /// entry and counted in the \a ReuseShuffleIndices, if any.
@@ -1773,6 +1795,12 @@
       return Operands[OpIdx];
     }
 
+    /// \returns the \p OpIdx operand of this TreeEntry.
+    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
+      assert(OpIdx < Operands.size() && "Off bounds");
+      return Operands[OpIdx];
+    }
+
     /// \returns the number of operands.
     unsigned getNumOperands() const { return Operands.size(); }
 
@@ -2078,7 +2106,7 @@
   SmallPtrSet<const Value *, 32> EphValues;
 
   /// Holds all of the instructions that we gathered.
-  SetVector<Instruction *> GatherSeq;
+  SetVector<Instruction *> GatherShuffleSeq;
 
   /// A list of blocks that we are going to CSE.
   SetVector<BasicBlock *> CSEBlocks;
@@ -5016,7 +5044,30 @@
       // VecCost is equal to sum of the cost of creating 2 vectors
       // and the cost of creating shuffle.
       InstructionCost VecCost = 0;
-      if (Instruction::isBinaryOp(E->getOpcode())) {
+      // Try to find the previous shuffle node with the same operands and same
+      // main/alternate ops.
+      auto &&TryFindNodeWithEqualOperands = [this, E]() {
+        for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+          if (TE.get() == E)
+            break;
+          if (TE->isAltShuffle() &&
+              ((TE->getOpcode() == E->getOpcode() &&
+                TE->getAltOpcode() == E->getAltOpcode()) ||
+               (TE->getOpcode() == E->getAltOpcode() &&
+                TE->getAltOpcode() == E->getOpcode())) &&
+              TE->hasEqualOperands(*E))
+            return true;
+        }
+        return false;
+      };
+      if (TryFindNodeWithEqualOperands()) {
+        LLVM_DEBUG({
+          dbgs() << "SLP: diamond match for alternate node found.\n";
+          E->dump();
+        });
+        // No need to add new vector costs here since we're going to reuse
+        // same main/alternate vector ops, just do different shuffling.
+      } else if (Instruction::isBinaryOp(E->getOpcode())) {
         VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
         VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
                                                CostKind);
@@ -5728,7 +5779,7 @@
     auto *InsElt = dyn_cast<InsertElementInst>(Vec);
     if (!InsElt)
       return Vec;
-    GatherSeq.insert(InsElt);
+    GatherShuffleSeq.insert(InsElt);
     CSEBlocks.insert(InsElt->getParent());
     // Add to our 'need-to-extract' list.
     if (TreeEntry *Entry = getTreeEntry(V)) {
@@ -5915,7 +5966,7 @@
       ShuffleBuilder.addMask(ReuseShuffleIndicies);
       Vec = ShuffleBuilder.finalize(Vec);
       if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherSeq.insert(I);
+        GatherShuffleSeq.insert(I);
         CSEBlocks.insert(I->getParent());
       }
     }
@@ -5953,7 +6004,7 @@
         ShuffleBuilder.addMask(E->ReuseShuffleIndices);
         Vec = ShuffleBuilder.finalize(Vec);
         if (auto *I = dyn_cast<Instruction>(Vec)) {
-          GatherSeq.insert(I);
+          GatherShuffleSeq.insert(I);
           CSEBlocks.insert(I->getParent());
         }
       }
@@ -6444,6 +6495,14 @@
         V1 = Builder.CreateCast(
             static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
       }
+      // Add V0 and V1 to later analysis to try to find and remove matching
+      // instruction, if any.
+      for (Value *V : {V0, V1}) {
+        if (auto *I = dyn_cast<Instruction>(V)) {
+          GatherShuffleSeq.insert(I);
+          CSEBlocks.insert(I->getParent());
+        }
+      }
 
       // Create shuffle to take alternate operations from the vector.
       // Also, gather up main and alt scalar ops to propagate IR flags to
@@ -6657,10 +6716,10 @@
 }
 
 void BoUpSLP::optimizeGatherSequence() {
-  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size()
                     << " gather sequences instructions.\n");
   // LICM InsertElementInst sequences.
-  for (Instruction *I : GatherSeq) {
+  for (Instruction *I : GatherShuffleSeq) {
     if (isDeleted(I))
       continue;
 
@@ -6719,7 +6778,7 @@
       if (isDeleted(&In))
         continue;
       if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) &&
-          !isa<ShuffleVectorInst>(&In))
+          !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In))
        continue;
 
      // Check if we can replace this instruction with any of the
@@ -6741,7 +6800,7 @@
     }
   }
   CSEBlocks.clear();
-  GatherSeq.clear();
+  GatherShuffleSeq.clear();
 }
 
 // Groups the instructions to a bundle (which is then a single scheduling entity)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -9,11 +9,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -39,12 +37,10 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.0 = getelementptr i64, i64* %a, i64 0
@@ -73,11 +69,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -111,11 +105,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -224,17 +216,15 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]],
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i32> [[TMP8]],
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]],
-; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
-; CHECK-NEXT:    ret i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]],
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]],
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
+; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -9,11 +9,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -39,12 +37,10 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.0 = getelementptr i64, i64* %a, i64 0
@@ -73,11 +69,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -111,11 +105,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -224,17 +216,15 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]],
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i32> [[TMP8]],
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]],
-; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
-; CHECK-NEXT:    ret i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]],
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]],
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
+; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
@@ -8,7 +8,7 @@
 ; YAML-NEXT: Function:        build_vec_v2i64
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '-8'
+; YAML-NEXT:   - Cost:            '-10'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '8'
 
@@ -17,11 +17,9 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1