diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8366,13 +8366,13 @@
   }

   /// Attempt to vectorize the tree found by matchAssociativeReduction.
-  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+  Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
     // If there are a sufficient number of reduction values, reduce
     // to a nearby power-of-2. We can safely generate oversized
     // vectors and rely on the backend to split them to legal sizes.
     unsigned NumReducedVals = ReducedVals.size();
     if (NumReducedVals < 4)
-      return false;
+      return nullptr;

     // Intersect the fast-math-flags from all reduction operations.
     FastMathFlags RdxFMF;
@@ -8473,7 +8473,7 @@
     InstructionCost Cost = TreeCost + ReductionCost;
     if (!Cost.isValid()) {
       LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
-      return false;
+      return nullptr;
     }
     if (Cost >= -SLPCostThreshold) {
       V.getORE()->emit([&]() {
@@ -8553,7 +8553,7 @@
       // vector reductions.
       V.eraseInstructions(IgnoreList);
     }
-    return VectorizedTree != nullptr;
+    return VectorizedTree;
   }

   unsigned numReductionValues() const { return ReducedVals.size(); }
@@ -8839,32 +8839,45 @@
   // Skip the analysis of CmpInsts. Compiler implements postanalysis of the
   // CmpInsts so we can skip extra attempts in
   // tryToVectorizeHorReductionOrInstOperands and save compile time.
-  SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
+  std::queue<std::pair<Instruction *, unsigned>> Stack;
+  Stack.emplace(Root, 0);
   SmallPtrSet<Value *, 8> VisitedInstrs;
+  SmallVector<WeakTrackingVH> PostponedInsts;
   bool Res = false;
+  auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0,
+                                     Value *&B1) -> Value * {
+    bool IsBinop = matchRdxBop(Inst, B0, B1);
+    bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
+    if (IsBinop || IsSelect) {
+      HorizontalReduction HorRdx;
+      if (HorRdx.matchAssociativeReduction(P, Inst))
+        return HorRdx.tryToReduce(R, TTI);
+    }
+    return nullptr;
+  };
   while (!Stack.empty()) {
     Instruction *Inst;
     unsigned Level;
-    std::tie(Inst, Level) = Stack.pop_back_val();
+    std::tie(Inst, Level) = Stack.front();
+    Stack.pop();
     // Do not try to analyze instruction that has already been vectorized.
     // This may happen when we vectorize instruction operands on a previous
     // iteration while stack was populated before that happened.
     if (R.isDeleted(Inst))
       continue;
-    Value *B0, *B1;
-    bool IsBinop = matchRdxBop(Inst, B0, B1);
-    bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
-    if (IsBinop || IsSelect) {
-      HorizontalReduction HorRdx;
-      if (HorRdx.matchAssociativeReduction(P, Inst)) {
-        if (HorRdx.tryToReduce(R, TTI)) {
-          Res = true;
-          // Set P to nullptr to avoid re-analysis of phi node in
-          // matchAssociativeReduction function unless this is the root node.
-          P = nullptr;
-          continue;
-        }
+    Value *B0 = nullptr, *B1 = nullptr;
+    if (Value *V = TryToReduce(Inst, B0, B1)) {
+      Res = true;
+      // Set P to nullptr to avoid re-analysis of phi node in
+      // matchAssociativeReduction function unless this is the root node.
+      P = nullptr;
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        // Try to find another reduction.
+        Stack.emplace(I, Level);
+        continue;
       }
+    } else {
+      bool IsBinop = B0 && B1;
       if (P && IsBinop) {
         Inst = dyn_cast<Instruction>(B0);
         if (Inst == P)
@@ -8881,10 +8894,10 @@
       // matchAssociativeReduction function unless this is the root node.
       P = nullptr;
       // Do not try to vectorize CmpInst operands, this is done separately.
-      if (!isa<CmpInst>(Inst) && Vectorize(Inst, R)) {
-        Res = true;
-        continue;
-      }
+      // Final attempt for binop args vectorization should happen after the loop
+      // to try to find reductions.
+      if (!isa<CmpInst>(Inst))
+        PostponedInsts.push_back(Inst);

     // Try to vectorize operands.
     // Continue analysis for the instruction from the same basic block only to
@@ -8897,8 +8910,13 @@
           // separately.
           if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
               I->getParent() == BB)
-            Stack.emplace_back(I, Level);
+            Stack.emplace(I, Level);
   }
+  // Try to vectorize binops where reductions were not found.
+  for (Value *V : PostponedInsts)
+    if (auto *Inst = dyn_cast<Instruction>(V))
+      if (!R.isDeleted(Inst))
+        Res |= Vectorize(Inst, R);
   return Res;
 }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
@@ -30,39 +30,40 @@
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[DOT_115:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[ADD39:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA:%.*]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[MUL7:%.*]] = fmul float [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[DOT_115]], [[MUL7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX10]], align 4
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX13]], align 4
-; CHECK-NEXT:    [[MUL14:%.*]] = fmul float [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[ADD15:%.*]] = fadd float [[ADD]], [[MUL14]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP5]]
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[ARRAYIDX18]] to <2 x float>*
-; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[ARRAYIDX21]] to <2 x float>*
-; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; CHECK-NEXT:    [[ADD23:%.*]] = fadd float [[ADD15]], [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
-; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD23]], [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
-; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX34]], align 4
-; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX37]], align 4
-; CHECK-NEXT:    [[MUL38:%.*]] = fmul float [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX6]] to <2 x float>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[DOT_115]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
+; CHECK-NEXT:    [[ADD15:%.*]] = fadd float [[ADD]], [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[ARRAYIDX18]] to <2 x float>*
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[TMP10]], align 4
+; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast float* [[ARRAYIDX21]] to <2 x float>*
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x float> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; CHECK-NEXT:    [[ADD23:%.*]] = fadd float [[ADD15]], [[TMP15]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD23]], [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX34]], align 4
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX37]], align 4
+; CHECK-NEXT:    [[MUL38:%.*]] = fmul float [[TMP18]], [[TMP19]]
 ; CHECK-NEXT:    [[ADD39]] = fadd float [[ADD31]], [[MUL38]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 32000
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -16,22 +16,23 @@
 ; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2
 ; CHECK-NEXT:    [[PTRX3:%.*]] = getelementptr inbounds double, double* [[PTRX]], i64 3
 ; CHECK-NEXT:    [[PTRY3:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 3
-; CHECK-NEXT:    [[X0:%.*]] = load double, double* [[PTRX]], align 4
-; CHECK-NEXT:    [[Y0:%.*]] = load double, double* [[PTRY]], align 4
-; CHECK-NEXT:    [[X1:%.*]] = load double, double* [[PTRX1]], align 4
-; CHECK-NEXT:    [[Y1:%.*]] = load double, double* [[PTRY1]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[PTRX2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[PTRX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[PTRY2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[PTRY]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
-; CHECK-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
-; CHECK-NEXT:    [[MUL1:%.*]] = fmul double [[X1]], [[Y1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[MUL0]], [[MUL1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-; CHECK-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
-; CHECK-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[PTRX2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[PTRY2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
+; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
+; CHECK-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
+; CHECK-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP14]]
 ; CHECK-NEXT:    ret double [[DOT0123]]
 ;
   %ptrx1 = getelementptr inbounds double, double* %ptrx, i64 1
@@ -66,22 +67,23 @@
 ; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2
 ; CHECK-NEXT:    [[PTRX3:%.*]] = getelementptr inbounds float, float* [[PTRX]], i64 3
 ; CHECK-NEXT:    [[PTRY3:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 3
-; CHECK-NEXT:    [[X0:%.*]] = load float, float* [[PTRX]], align 4
-; CHECK-NEXT:    [[Y0:%.*]] = load float, float* [[PTRY]], align 4
-; CHECK-NEXT:    [[X1:%.*]] = load float, float* [[PTRX1]], align 4
-; CHECK-NEXT:    [[Y1:%.*]] = load float, float* [[PTRY1]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[PTRX2]] to <2 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[PTRX]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[PTRY2]] to <2 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[PTRY]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
-; CHECK-NEXT:    [[MUL1:%.*]] = fmul float [[X1]], [[Y1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[DOT01:%.*]] = fadd float [[MUL0]], [[MUL1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
-; CHECK-NEXT:    [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
-; CHECK-NEXT:    [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[PTRX2]] to <2 x float>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[PTRY2]] to <2 x float>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
+; CHECK-NEXT:    [[DOT01:%.*]] = fadd float [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
+; CHECK-NEXT:    [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
+; CHECK-NEXT:    [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP14]]
 ; CHECK-NEXT:    ret float [[DOT0123]]
 ;
   %ptrx1 = getelementptr inbounds float, float* %ptrx, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -142,17 +142,19 @@
 ; MINTREESIZE-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
 ; MINTREESIZE-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
 ; MINTREESIZE-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
+; MINTREESIZE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0
+; MINTREESIZE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[Q1]], i32 1
 ; MINTREESIZE-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
 ; MINTREESIZE-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; MINTREESIZE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
-; MINTREESIZE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[Q3]], i32 1
+; MINTREESIZE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
+; MINTREESIZE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Q3]], i32 1
 ; MINTREESIZE-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
 ; MINTREESIZE-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
-; MINTREESIZE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
-; MINTREESIZE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Q5]], i32 1
-; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q5]], i32 1
+; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
+; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
+; MINTREESIZE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
 ; MINTREESIZE-NEXT:    ret <4 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -177,17 +177,19 @@
 ; MINTREESIZE-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
 ; MINTREESIZE-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
 ; MINTREESIZE-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
+; MINTREESIZE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0
+; MINTREESIZE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[Q1]], i32 1
 ; MINTREESIZE-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
 ; MINTREESIZE-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; MINTREESIZE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
-; MINTREESIZE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[Q3]], i32 1
+; MINTREESIZE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
+; MINTREESIZE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Q3]], i32 1
 ; MINTREESIZE-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
 ; MINTREESIZE-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
-; MINTREESIZE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
-; MINTREESIZE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Q5]], i32 1
-; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q5]], i32 1
+; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
+; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
+; MINTREESIZE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
 ; MINTREESIZE-NEXT:    ret <4 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
@@ -22,25 +22,21 @@
 ; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* undef, i64 0, i64 4
 ; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* undef, i64 0, i64 5
 ; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* undef, i64 0, i64 6
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[I]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[I1]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[I2]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 16
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[I3]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> undef, [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP9]], [[TMP3]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i32> [[TMP10]], [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add <2 x i32> [[TMP11]], undef
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0
-; CHECK-NEXT:    [[I11:%.*]] = add i32 [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
-; CHECK-NEXT:    [[I18:%.*]] = add i32 [[TMP15]], [[I11]]
-; CHECK-NEXT:    [[I19:%.*]] = add i32 [[TMP15]], [[I18]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[I]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 8
+; CHECK-NEXT:    [[I5:%.*]] = add i32 undef, undef
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[OP_EXTRA2:%.*]] = add i32 [[TMP2]], [[I5]]
+; CHECK-NEXT:    [[I10:%.*]] = add i32 [[OP_EXTRA2]], undef
+; CHECK-NEXT:    [[I11:%.*]] = add i32 [[OP_EXTRA2]], [[I10]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[I1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[I12:%.*]] = add i32 undef, undef
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP5]], [[I12]]
+; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], undef
+; CHECK-NEXT:    [[I18:%.*]] = add i32 [[OP_EXTRA1]], [[I11]]
+; CHECK-NEXT:    [[I19:%.*]] = add i32 [[OP_EXTRA1]], [[I18]]
 ; CHECK-NEXT:    [[I20:%.*]] = add i32 undef, [[I19]]
 ; CHECK-NEXT:    [[I21:%.*]] = add i32 undef, [[I20]]
 ; CHECK-NEXT:    [[I22:%.*]] = add i32 undef, [[I21]]
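
Note for reviewers: the functional change above is twofold. tryToReduce now returns the emitted reduction value (Value *) instead of a bool, and the walk in tryToVectorizeHorReductionOrInstOperands switches from a LIFO stack to a FIFO queue that postpones plain binop vectorization until no more reductions can be matched. The sketch below is a minimal standalone model of that control flow under those assumptions; it is illustrative only, not LLVM code, and Node, TryReduce, and VectorizeBinop are hypothetical stand-ins (bookkeeping such as the visited set and the CmpInst exclusions is omitted).

// Standalone illustration of the patched traversal; compiles as C++17.
#include <functional>
#include <queue>
#include <utility>
#include <vector>

// Hypothetical expression node standing in for an instruction.
struct Node {
  std::vector<Node *> Operands;
};

// Models the new scheme: reductions first, binop vectorization last.
bool reduceThenVectorize(Node *Root, unsigned MaxDepth,
                         const std::function<Node *(Node *)> &TryReduce,
                         const std::function<bool(Node *)> &VectorizeBinop) {
  std::queue<std::pair<Node *, unsigned>> Worklist; // FIFO, was a LIFO stack
  Worklist.emplace(Root, 0u);
  std::vector<Node *> Postponed;
  bool Changed = false;
  while (!Worklist.empty()) {
    auto [N, Level] = Worklist.front();
    Worklist.pop();
    if (Node *Reduced = TryReduce(N)) {
      Changed = true;
      // Key change: re-enqueue the emitted value so a reduction feeding
      // another reduction is matched in the same walk.
      Worklist.emplace(Reduced, Level);
      continue;
    }
    // No reduction matched: remember the node for a late vectorization
    // attempt instead of vectorizing it now, which could consume the
    // operands of an outer reduction.
    Postponed.push_back(N);
    if (++Level < MaxDepth)
      for (Node *Op : N->Operands)
        Worklist.emplace(Op, Level);
  }
  // Final attempt: vectorize postponed binops only after the whole
  // expression has been scanned for reductions.
  for (Node *N : Postponed)
    Changed |= VectorizeBinop(N);
  return Changed;
}

The re-enqueueing is what the revectorized_rdx_crash.ll update reflects (two @llvm.vector.reduce.add.v4i32 calls instead of a partially vectorized <2 x i32> add chain), while the postponed final Vectorize pass accounts for the dot-product updates, where both pairs of multiplies now become <2 x double>/<2 x float> fmuls feeding the scalar fadd chain.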