diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7084,6 +7084,94 @@
 } // end anonymous namespace
 
+static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
+  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
+    return cast<FixedVectorType>(IE->getType())->getNumElements();
+
+  unsigned AggregateSize = 1;
+  auto *IV = cast<InsertValueInst>(InsertInst);
+  Type *CurrentType = IV->getType();
+  do {
+    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+      for (auto *Elt : ST->elements())
+        if (Elt != ST->getElementType(0)) // check homogeneity
+          return None;
+      AggregateSize *= ST->getNumElements();
+      CurrentType = ST->getElementType(0);
+    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+      AggregateSize *= AT->getNumElements();
+      CurrentType = AT->getElementType();
+    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
+      AggregateSize *= VT->getNumElements();
+      return AggregateSize;
+    } else if (CurrentType->isSingleValueType()) {
+      return AggregateSize;
+    } else {
+      return None;
+    }
+  } while (true);
+}
+
+static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
+                                          unsigned OperandOffset) {
+  unsigned OperandIndex = OperandOffset;
+  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
+    if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+      auto *VT = cast<FixedVectorType>(IE->getType());
+      OperandIndex *= VT->getNumElements();
+      OperandIndex += CI->getZExtValue();
+      return OperandIndex;
+    }
+    return None;
+  }
+
+  auto *IV = cast<InsertValueInst>(InsertInst);
+  Type *CurrentType = IV->getType();
+  for (unsigned int Index : IV->indices()) {
+    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+      OperandIndex *= ST->getNumElements();
+      CurrentType = ST->getElementType(Index);
+    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+      OperandIndex *= AT->getNumElements();
+      CurrentType = AT->getElementType();
+    } else {
+      return None;
+    }
+    OperandIndex += Index;
+  }
+  return OperandIndex;
+}
+
+static bool findBuildAggregate_rec(Instruction *LastInsertInst,
+                                   TargetTransformInfo *TTI,
+                                   SmallVectorImpl<Value *> &BuildVectorOpds,
+                                   SmallVectorImpl<Value *> &InsertElts,
+                                   unsigned OperandOffset) {
+  do {
+    Value *InsertedOperand = LastInsertInst->getOperand(1);
+    Optional<unsigned> OperandIndex =
+        getOperandIndex(LastInsertInst, OperandOffset);
+    if (!OperandIndex)
+      return false;
+    if (isa<InsertElementInst>(InsertedOperand) ||
+        isa<InsertValueInst>(InsertedOperand)) {
+      if (!findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
+                                  BuildVectorOpds, InsertElts, *OperandIndex))
+        return false;
+    } else {
+      BuildVectorOpds[*OperandIndex] = InsertedOperand;
+      InsertElts[*OperandIndex] = LastInsertInst;
+    }
+    if (isa<UndefValue>(LastInsertInst->getOperand(0)))
+      return true;
+    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
+  } while (LastInsertInst != nullptr &&
+           (isa<InsertValueInst>(LastInsertInst) ||
+            isa<InsertElementInst>(LastInsertInst)) &&
+           LastInsertInst->hasOneUse());
+  return false;
+}
+
 /// Recognize construction of vectors like
 /// %ra = insertelement <4 x float> undef, float %s0, i32 0
 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
@@ -7091,54 +7179,41 @@
 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
 /// starting from the last insertelement or insertvalue instruction.
 ///
-/// Also recognize aggregates like {<2 x float>, <2 x float>},
+/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
 /// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
 /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
 ///
 /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
 ///
 /// \return true if it matches.
-static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
+static bool findBuildAggregate(Instruction *LastInsertInst,
+                               TargetTransformInfo *TTI,
                                SmallVectorImpl<Value *> &BuildVectorOpds,
                                SmallVectorImpl<Value *> &InsertElts) {
+  assert((isa<InsertElementInst>(LastInsertInst) ||
+          isa<InsertValueInst>(LastInsertInst)) &&
+         "Expected insertelement or insertvalue instruction!");
-  do {
-    Value *InsertedOperand;
-    auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
-    if (IE) {
-      InsertedOperand = IE->getOperand(1);
-      LastInsertInst = IE->getOperand(0);
-    } else {
-      auto *IV = cast<InsertValueInst>(LastInsertInst);
-      InsertedOperand = IV->getInsertedValueOperand();
-      LastInsertInst = IV->getAggregateOperand();
-    }
-    if (isa<InsertElementInst>(InsertedOperand) ||
-        isa<InsertValueInst>(InsertedOperand)) {
-      SmallVector<Value *, 8> TmpBuildVectorOpds;
-      SmallVector<Value *, 8> TmpInsertElts;
-      if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
-                              TmpInsertElts))
-        return false;
-      BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
-                             TmpBuildVectorOpds.rend());
-      InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
-    } else {
-      BuildVectorOpds.push_back(InsertedOperand);
-      InsertElts.push_back(IE);
-    }
-    if (isa<UndefValue>(LastInsertInst))
-      break;
-    if ((!isa<InsertValueInst>(LastInsertInst) &&
-         !isa<InsertElementInst>(LastInsertInst)) ||
-        !LastInsertInst->hasOneUse())
-      return false;
-  } while (true);
-  std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
-  std::reverse(InsertElts.begin(), InsertElts.end());
-  return true;
+
+  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
+         "Expected empty result vectors!");
+
+  Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
+  if (!AggregateSize)
+    return false;
+  BuildVectorOpds.resize(*AggregateSize);
+  InsertElts.resize(*AggregateSize);
+
+  if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts,
+                             0)) {
+    llvm::erase_if(BuildVectorOpds,
+                   [](const Value *V) { return V == nullptr; });
+    llvm::erase_if(InsertElts, [](const Value *V) { return V == nullptr; });
+    if (BuildVectorOpds.size() >= 2)
+      return true;
+  }
+
+  return false;
 }
 
 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
@@ -7308,8 +7383,7 @@
 
   SmallVector<Value *, 16> BuildVectorOpds;
   SmallVector<Value *, 16> BuildVectorInsts;
-  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
-      BuildVectorOpds.size() < 2)
+  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
     return false;
 
   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
@@ -7324,7 +7398,6 @@
   SmallVector<Value *, 16> BuildVectorInsts;
   SmallVector<Value *, 16> BuildVectorOpds;
   if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
-      BuildVectorOpds.size() < 2 ||
       (llvm::all_of(BuildVectorOpds,
                     [](Value *V) { return isa<ExtractElementInst>(V); }) &&
        isShuffle(BuildVectorOpds)))
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll
@@ -37,11 +37,10 @@
 
 define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 {
 ; CHECK-LABEL: @reverse_hadd_v4f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> [[A]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
---
a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -126,13 +126,16 @@ ; doesn't matter define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { ; ANY-LABEL: @simple_select_insert_out_of_order( -; ANY-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer -; ANY-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]] -; ANY-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; ANY-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> undef, <4 x i32> +; ANY-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> +; ANY-NEXT: [[REORDER_SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> +; ANY-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[REORDER_SHUFFLE]], zeroinitializer +; ANY-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[REORDER_SHUFFLE1]], <4 x float> [[REORDER_SHUFFLE2]] +; ANY-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 ; ANY-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2 ; ANY-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 ; ANY-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1 -; ANY-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; ANY-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 ; ANY-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 0 ; ANY-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 ; ANY-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3 @@ -447,19 +450,19 @@ ; Make sure we handle multiple trees that feed one build vector correctly. 
define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) { ; ANY-LABEL: @multi_tree( -; ANY-NEXT: [[TMP1:%.*]] = insertelement <4 x double> undef, double [[W:%.*]], i32 0 -; ANY-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[X:%.*]], i32 1 -; ANY-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Y:%.*]], i32 2 -; ANY-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[Z:%.*]], i32 3 -; ANY-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], +; ANY-NEXT: [[TMP1:%.*]] = insertelement <4 x double> undef, double [[Z:%.*]], i32 0 +; ANY-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 1 +; ANY-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[X:%.*]], i32 2 +; ANY-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[W:%.*]], i32 3 +; ANY-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], ; ANY-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], -; ANY-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP6]], i32 0 +; ANY-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP6]], i32 3 ; ANY-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 3 -; ANY-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP6]], i32 1 +; ANY-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP6]], i32 2 ; ANY-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2 -; ANY-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP6]], i32 2 +; ANY-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP6]], i32 1 ; ANY-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP9]], i32 1 -; ANY-NEXT: [[TMP10:%.*]] = extractelement <4 x double> [[TMP6]], i32 3 +; ANY-NEXT: [[TMP10:%.*]] = extractelement <4 x double> [[TMP6]], i32 0 ; ANY-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP10]], i32 0 ; ANY-NEXT: ret <4 x double> [[I4]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll @@ -147,13 +147,13 @@ define {%StructTy, float, float} @NonHomogeneousStruct(float *%Ptr) { ; CHECK-LABEL: @NonHomogeneousStruct( ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[L0:%.*]] = load float, float* [[GEP0]] +; CHECK-NEXT: [[L0:%.*]] = load float, float* [[GEP0]], align 4 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 1 -; CHECK-NEXT: [[L1:%.*]] = load float, float* [[GEP1]] +; CHECK-NEXT: [[L1:%.*]] = load float, float* [[GEP1]], align 4 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 2 -; CHECK-NEXT: [[L2:%.*]] = load float, float* [[GEP2]] +; CHECK-NEXT: [[L2:%.*]] = load float, float* [[GEP2]], align 4 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 3 -; CHECK-NEXT: [[L3:%.*]] = load float, float* [[GEP3]] +; CHECK-NEXT: [[L3:%.*]] = load float, float* [[GEP3]], align 4 ; CHECK-NEXT: [[FADD0:%.*]] = fadd fast float [[L0]], 1.100000e+01 ; CHECK-NEXT: [[FADD1:%.*]] = fadd fast float [[L1]], 1.200000e+01 ; CHECK-NEXT: [[FADD2:%.*]] = fadd fast float [[L2]], 1.300000e+01 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll @@ -8,11 +8,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast { { float, float } }* [[A:%.*]] to <2 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 8 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[REORDER_SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: 
[[TMP2:%.*]] = fmul <2 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 ; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 ; CHECK-NEXT: [[INS0:%.*]] = insertelement <2 x float> [[INS1]], float [[TMP4]], i32 0 ; CHECK-NEXT: ret <2 x float> [[INS0]] ; @@ -44,23 +43,22 @@ ; CHECK-NEXT: [[GEP7:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 7 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP0]] to <8 x i16>* ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[REORDER_SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 ; CHECK-NEXT: [[STRUCTIN0:%.*]] = insertvalue [[STRUCT1TY:%.*]] undef, i16 [[TMP4]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCT1TY]] %StructIn0, i16 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 ; CHECK-NEXT: [[STRUCTIN2:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 ; CHECK-NEXT: [[STRUCTIN3:%.*]] = insertvalue [[STRUCT1TY]] %StructIn2, i16 [[TMP7]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 ; CHECK-NEXT: 
[[STRUCTIN4:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 ; CHECK-NEXT: [[STRUCTIN5:%.*]] = insertvalue [[STRUCT1TY]] %StructIn4, i16 [[TMP9]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 ; CHECK-NEXT: [[STRUCTIN6:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP10]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 ; CHECK-NEXT: [[STRUCTIN7:%.*]] = insertvalue [[STRUCT1TY]] %StructIn6, i16 [[TMP11]], 0 ; CHECK-NEXT: [[STRUCT2IN0:%.*]] = insertvalue [[STRUCT2TY:%.*]] undef, [[STRUCT1TY]] %StructIn1, 0 ; CHECK-NEXT: [[STRUCT2IN1:%.*]] = insertvalue [[STRUCT2TY]] %Struct2In0, [[STRUCT1TY]] %StructIn3, 1