diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -93,11 +93,15 @@
   bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
 
   /// Try to vectorize a list of operands.
-  /// \param UserCost Cost of the user operations of \p VL if they may affect
-  /// the cost of the vectorization.
+  /// When \p InsertUses is provided and its entries are non-null, the users
+  /// of \p VL are known to be InsertElement instructions, each associated
+  /// with the \p VL entry of the same index. Their cost is then used to
+  /// offset the cost of the vectorization, on the assumption that the
+  /// instcombine pass later optimizes away the resulting
+  /// ExtractElement-InsertElement sequences.
   /// \returns true if a value was vectorized.
   bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
-                          int UserCost = 0, bool AllowReorder = false);
+                          bool AllowReorder = false,
+                          ArrayRef<Value *> InsertUses = None);
 
   /// Try to vectorize a chain that may start at the operands of \p I.
   bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5929,12 +5929,13 @@
 bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   if (!A || !B)
     return false;
-  Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
+  Value *VL[] = {A, B};
+  return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
 }
 
 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
-                                           int UserCost, bool AllowReorder) {
+                                           bool AllowReorder,
+                                           ArrayRef<Value *> InsertUses) {
   if (VL.size() < 2)
     return false;
 
@@ -5983,6 +5984,13 @@
   bool CandidateFound = false;
   int MinCost = SLPCostThreshold;
 
+  bool CompensateUseCost =
+      !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
+        return V && isa<InsertElementInst>(V);
+      });
+  assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
+         "Each scalar expected to have an associated InsertElement user.");
+
   unsigned NextInst = 0, MaxInst = VL.size();
   for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
     // No actual vectorization should happen, if number of parts is the same as
@@ -6030,8 +6038,48 @@
         continue;
 
       R.computeMinimumValueSizes();
-      int Cost = R.getTreeCost() - UserCost;
+      int Cost = R.getTreeCost();
       CandidateFound = true;
+      if (CompensateUseCost) {
+        // TODO: Use TTI's getScalarizationOverhead for the sequence of
+        // inserts rather than the sum of single inserts, as the latter may
+        // overestimate the cost. This work should also imply improving the
+        // cost estimation for the extracts added for external (to the
+        // vectorization tree) users, i.e. that part should switch to the
+        // same interface.
+        // For example, the following case is projected code after SLP:
+        //  %4 = extractelement <4 x i64> %3, i32 0
+        //  %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
+        //  %5 = extractelement <4 x i64> %3, i32 1
+        //  %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
+        //  %6 = extractelement <4 x i64> %3, i32 2
+        //  %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
+        //  %7 = extractelement <4 x i64> %3, i32 3
+        //  %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
+        //
+        // The extracts here are added by SLP to feed the users (the inserts)
+        // of the original scalars, and they contribute to "ExtractCost"
+        // during cost evaluation. The inserts in turn form a sequence that
+        // builds an aggregate, which is detected by the findBuildAggregate
+        // routine.
+        // SLP assumes that such a sequence will be optimized away later
+        // (by instcombine), so it tries to compensate ExtractCost with the
+        // cost of the insert sequence.
+        // The current per-element cost calculation approach is not quite
+        // accurate and tends to create a bias toward favoring vectorization.
+        // Switching to the TTI interface might help a bit. An alternative
+        // solution could be to pattern-match the whole sequence and treat it
+        // as a no-op or a shuffle.
+        unsigned UserCost = 0;
+        for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
+          auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
+          if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
+            UserCost += TTI->getVectorInstrCost(
+                Instruction::InsertElement, IE->getType(), CI->getZExtValue());
+        }
+        LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
+                          << ".\n");
+        Cost -= UserCost;
+      }
+
       MinCost = std::min(MinCost, Cost);
 
       if (Cost < -SLPCostThreshold) {
@@ -7047,48 +7095,16 @@
 /// \return true if it matches.
 static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
                                SmallVectorImpl<Value *> &BuildVectorOpds,
-                               int &UserCost) {
+                               SmallVectorImpl<Value *> &InsertElts) {
   assert((isa<InsertElementInst>(LastInsertInst) ||
           isa<InsertValueInst>(LastInsertInst)) &&
          "Expected insertelement or insertvalue instruction!");
-  UserCost = 0;
   do {
-    // TODO: Use TTI's getScalarizationOverhead for sequence of inserts rather
-    // than sum of single inserts as the latter may overestimate cost.
-    // This work should imply improving cost estimation for extracts that
-    // added in for external (for vectorization tree) users.
-    // For example, in following case all extracts added in order to feed
-    // into external users (inserts), which in turn form sequence to build
-    // an aggregate that we do match here:
-    //  %4 = extractelement <4 x i64> %3, i32 0
-    //  %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
-    //  %5 = extractelement <4 x i64> %3, i32 1
-    //  %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
-    //  %6 = extractelement <4 x i64> %3, i32 2
-    //  %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
-    //  %7 = extractelement <4 x i64> %3, i32 3
-    //  %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
-    //
-    // Cost of this entire sequence is currently estimated as sum of single
-    // extracts (as this aggregate build sequence is an external to
-    // vectorization tree user) minus cost of the aggregate build.
-    // As this whole sequence will be optimized away we want the cost to be
-    // zero. But it is not quite possible using given approach (at least for
-    // X86) because inserts can be more expensive than extracts for longer
-    // vector lengths so the difference turns out not zero in such a case.
-    // Ideally we want to match this entire sequence and treat it as a no-op
-    // (i.e. do not count into final cost at all).
-    // Currently the difference tends to be negative thus adding a bias
-    // toward favoring vectorization. If we switch into using TTI interface
-    // the bias tendency will remain but will be lower.
     Value *InsertedOperand;
-    if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
+    auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
+    if (IE) {
       InsertedOperand = IE->getOperand(1);
       LastInsertInst = IE->getOperand(0);
-      if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
-        UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
-                                            IE->getType(), CI->getZExtValue());
-      }
     } else {
       auto *IV = cast<InsertValueInst>(LastInsertInst);
       InsertedOperand = IV->getInsertedValueOperand();
@@ -7096,16 +7112,17 @@
     }
     if (isa<InsertElementInst>(InsertedOperand) ||
         isa<InsertValueInst>(InsertedOperand)) {
-      int TmpUserCost;
       SmallVector<Value *, 8> TmpBuildVectorOpds;
+      SmallVector<Value *, 8> TmpInsertElts;
       if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
-                              TmpUserCost))
+                              TmpInsertElts))
         return false;
       BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
                              TmpBuildVectorOpds.rend());
-      UserCost += TmpUserCost;
+      InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
     } else {
       BuildVectorOpds.push_back(InsertedOperand);
+      InsertElts.push_back(IE);
     }
     if (isa<UndefValue>(LastInsertInst))
       break;
@@ -7115,6 +7132,7 @@
       return false;
   } while (true);
   std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+  std::reverse(InsertElts.begin(), InsertElts.end());
   return true;
 }
 
@@ -7279,26 +7297,29 @@
 
 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                  BasicBlock *BB, BoUpSLP &R) {
-  int UserCost = 0;
   const DataLayout &DL = BB->getModule()->getDataLayout();
   if (!R.canMapToVector(IVI->getType(), DL))
     return false;
 
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost))
+  SmallVector<Value *, 16> BuildVectorInsts;
+  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+      BuildVectorOpds.size() < 2)
     return false;
 
   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
   // Aggregate value is unlikely to be processed in vector register, we need to
   // extract scalars into scalar registers, so NeedExtraction is set true.
-  return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+                            BuildVectorInsts);
 }
 
 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                    BasicBlock *BB, BoUpSLP &R) {
-  int UserCost;
+  SmallVector<Value *, 16> BuildVectorInsts;
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) ||
+  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+      BuildVectorOpds.size() < 2 ||
       (llvm::all_of(BuildVectorOpds,
                     [](Value *V) { return isa<ExtractElementInst>(V); }) &&
        isShuffle(BuildVectorOpds)))
@@ -7306,7 +7327,8 @@
 
   // Vectorize starting with the build vector operands ignoring the BuildVector
   // instructions for the purpose of scheduling and user extraction.
-  return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+                            BuildVectorInsts);
 }
 
 bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -7384,8 +7406,8 @@
       // is done when there are exactly two elements since tryToVectorizeList
       // asserts that there are only two values when AllowReorder is true.
       bool AllowReorder = NumElts == 2;
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
-                                            /*UserCost=*/0, AllowReorder)) {
+      if (NumElts > 1 &&
+          tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -153,15 +153,12 @@
 ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
 ; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
+; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
 ; CHECK-NEXT: [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
 ; CHECK-NEXT: [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
 ; CHECK-NEXT: ret <4 x i32> [[TMP2_3]]
@@ -187,21 +184,25 @@
 
 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <4 x i32>
+; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
+; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32>
+; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32>
+; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0
+; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
+; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32>
 ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -56,20 +56,38 @@
 ; SSE-NEXT: ret <8 x float> [[R7]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32>
-; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32>
-; SLM-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
-; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <8 x i32>
-; SLM-NEXT: [[TMP9:%.*]] = fdiv <4 x float> [[TMP5]], [[TMP6]]
-; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> undef, <8 x i32>
-; SLM-NEXT: [[R3:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP3]], <8 x i32>
-; SLM-NEXT: [[R4:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32>
-; SLM-NEXT: [[R6:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP10]], <8 x i32>
-; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[R6]], <8 x float> [[TMP8]], <8 x i32>
+; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SLM-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SLM-NEXT: [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1
+; SLM-NEXT: [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2
+; SLM-NEXT: [[B3:%.*]] = extractelement <8 x float> [[B]], i32 3
+; SLM-NEXT: [[B4:%.*]] = extractelement <8 x float> [[B]], i32 4
+; SLM-NEXT: [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5
+; SLM-NEXT: [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6
+; SLM-NEXT: [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7
+; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], [[B0]]
+; SLM-NEXT: [[AB1:%.*]] = fdiv float [[A1]], [[B1]]
+; SLM-NEXT: [[AB2:%.*]] = fdiv float [[A2]], [[B2]]
+; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], [[B3]]
+; SLM-NEXT: [[AB4:%.*]] = fmul float [[A4]], [[B4]]
+; SLM-NEXT: [[AB5:%.*]] = fdiv float [[A5]], [[B5]]
+; SLM-NEXT: [[AB6:%.*]] = fdiv float [[A6]], [[B6]]
+; SLM-NEXT: [[AB7:%.*]] = fmul float [[A7]], [[B7]]
+; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SLM-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
 ; SLM-NEXT: ret <8 x float> [[R7]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v8f32(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -101,11 +101,44 @@
 }
 
 define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @ashr_shl_v8i32(
-; CHECK-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
-; CHECK-NEXT: ret <8 x i32> [[R7]]
+; SSE-LABEL: @ashr_shl_v8i32(
+; SSE-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @ashr_shl_v8i32(
+; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
+; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
+; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
+; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32>
+; AVX1-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @ashr_shl_v8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
+; AVX2-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_shl_v8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
+; AVX512-NEXT: ret <8 x i32> [[R7]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -179,13 +179,22 @@
 ; SSE-NEXT: ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32>
+; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
+; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
+; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
+; SLM-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SLM-NEXT: [[R1:%.*]] = fadd double [[B0]], [[B1]]
+; SLM-NEXT: [[R2:%.*]] = fadd double [[A2]], [[A3]]
+; SLM-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0
+; SLM-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
+; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
 ; SLM-NEXT: ret <4 x double> [[R03]]
 ;
 ; AVX-LABEL: @test_v4f64(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -179,13 +179,22 @@
 ; SSE-NEXT: ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32>
+; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
+; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
+; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
+; SLM-NEXT: [[R0:%.*]] = fsub double [[A0]], [[A1]]
+; SLM-NEXT: [[R1:%.*]] = fsub double [[B0]], [[B1]]
+; SLM-NEXT: [[R2:%.*]] = fsub double [[A2]], [[A3]]
+; SLM-NEXT: [[R3:%.*]] = fsub double [[B2]], [[B3]]
+; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0
+; SLM-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
+; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
 ; SLM-NEXT: ret <4 x double> [[R03]]
 ;
 ; AVX-LABEL: @test_v4f64(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -111,15 +111,13 @@
 ; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
 ; CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T2]] to i64*
 ; CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8
+; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32
+; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float
+; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0
 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[T8]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
-; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
-; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP6]], i32 1
+; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32
+; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float
+; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
 ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32
 ; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float
 ; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -6,8 +6,6 @@
 ; Vectorization triggered by cost bias caused by subtracting
 ; the cost of entire "aggregate build" sequence while
 ; building vectorizable tree from only a portion of it.
-; FIXME: this is unprofitable to vectorize.
-
 
 define void @test(i32* nocapture %t2) {
 ; CHECK-LABEL: @test(
@@ -29,9 +27,11 @@
 ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
+; CHECK-NEXT: [[T28:%.*]] = add nsw i32 [[T15]], [[T9]]
 ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270
 ; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137
 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -41,29 +41,16 @@
 ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[T15]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[T9]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 6270, i32 2
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T47]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
-; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
-; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP13]], i32 1
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
-; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[TMP14]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
-; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[TMP15]], i32 3
-; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[TMP12]], i32 4
-; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5
+; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
+; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
+; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[T28]], i32 0
+; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1
+; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
+; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
+; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4
+; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5
 ; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
-; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP15]], i32 7
+; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]],
 ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
 ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4
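
Editor's note (illustration, not part of the patch): the TODO kept in tryToVectorizeList suggests costing the whole insert sequence with a single TTI::getScalarizationOverhead query instead of summing per-lane getVectorInstrCost calls. A minimal sketch of that idea follows. It assumes a getScalarizationOverhead overload taking a DemandedElts mask, as provided by TTI around this point in time, and the helper name getInsertSequenceCost is hypothetical:

  // Sketch only: cost the insert sequence as one scalarization query so the
  // target can account for inserts it is able to merge (e.g. into a single
  // shuffle), which a per-lane sum cannot. Assumes an overload of the form
  // getScalarizationOverhead(Type *, const APInt &DemandedElts, bool Insert,
  // bool Extract).
  static unsigned getInsertSequenceCost(ArrayRef<Value *> InsertUses,
                                        unsigned BaseIdx, unsigned OpsWidth,
                                        TargetTransformInfo *TTI) {
    auto *VecTy = cast<VectorType>(InsertUses[BaseIdx]->getType());
    APInt DemandedElts = APInt::getNullValue(VecTy->getNumElements());
    // Mark only the lanes this build sequence actually writes.
    for (unsigned Lane = 0; Lane < OpsWidth; ++Lane) {
      auto *IE = cast<InsertElementInst>(InsertUses[BaseIdx + Lane]);
      if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
        DemandedElts.setBit(CI->getZExtValue());
    }
    return TTI->getScalarizationOverhead(VecTy, DemandedElts,
                                         /*Insert=*/true, /*Extract=*/false);
  }

The CompensateUseCost block would then subtract one such query per OpsWidth slice rather than accumulating per-element insert costs, reducing the vectorization bias described in the comment.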