Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -940,6 +940,10 @@ /// applies when shouldMaximizeVectorBandwidth returns true. unsigned getMinimumVF(unsigned ElemWidth) const; + /// \return The maximum vectorization factor for types of given element + /// bit width and opcode, or 0 if there is no maximum VF. + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; + /// \return True if it should be considered for address type promotion. /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is /// profitable without finding other extensions fed by the same input. @@ -1497,6 +1501,7 @@ virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0; + virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0; virtual bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0; virtual unsigned getCacheLineSize() const = 0; @@ -1916,6 +1921,9 @@ unsigned getMinimumVF(unsigned ElemWidth) const override { return Impl.getMinimumVF(ElemWidth); } + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override { + return Impl.getMaximumVF(ElemWidth, Opcode); + } bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override { return Impl.shouldConsiderAddressTypePromotion( Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -356,6 +356,8 @@ unsigned getMinimumVF(unsigned ElemWidth) const { return 0; } + unsigned getMaximumVF(unsigned 
ElemWidth, unsigned Opcode) const { return 0; } + bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -634,6 +634,11 @@ return TTIImpl->getMinimumVF(ElemWidth); } +unsigned TargetTransformInfo::getMaximumVF(unsigned ElemWidth, + unsigned Opcode) const { + return TTIImpl->getMaximumVF(ElemWidth, Opcode); +} + bool TargetTransformInfo::shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { return TTIImpl->shouldConsiderAddressTypePromotion( Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -170,6 +170,7 @@ unsigned getNumberOfRegisters(unsigned RCID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getMinVectorRegisterBitWidth() const; + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const; Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -288,6 +288,12 @@ return 32; } +unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { + if (Opcode == Instruction::Load || Opcode == Instruction::Store) + return 32 * 4 / ElemWidth; + return (ElemWidth == 16 && ST->has16BitInsts()) ? 
2 : 1; +} + unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const { Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -126,6 +126,10 @@ MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +static cl::opt<unsigned> +MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, + cl::desc("Maximum vectorization factor")); + static cl::opt<unsigned> MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, cl::desc("Maximum depth of the lookup for consecutive stores.")); @@ -741,6 +745,12 @@ return MinVecRegSize; } + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { + unsigned MaxVF = MaxVFOption.getNumOccurrences() ? + MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); + return MaxVF ? MaxVF : UINT_MAX; + } + /// Check if homogeneous aggregate is isomorphic to some VectorType. /// Accepts homogeneous multidimensional aggregate of scalars/vectors like /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, @@ -6131,6 +6141,7 @@ unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); unsigned MaxVF = std::max(PowerOf2Floor(VL.size()), MinVF); + MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) @@ -7573,7 +7584,6 @@ bool Changed = false; SmallVector<Value *, 4> Incoming; SmallPtrSet<Value *, 16> VisitedInstrs; - unsigned MaxVecRegSize = R.getMaxVecRegSize(); bool HaveVectorizedPhiNodes = true; while (HaveVectorizedPhiNodes) { @@ -7600,27 +7610,8 @@ // Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; - Type *EltTy = (*IncIt)->getType(); - - assert(EltTy->isSized() && - "Instructions should all be sized at this point"); - TypeSize EltTS = DL->getTypeSizeInBits(EltTy); - if (EltTS.isScalable()) { - // For now, just ignore vectorizing scalable types. - ++IncIt; - continue; - } - - unsigned EltSize = EltTS.getFixedSize(); - unsigned MaxNumElts = MaxVecRegSize / EltSize; - if (MaxNumElts < 2) { - ++IncIt; - continue; - } - while (SameTypeIt != E && - (*SameTypeIt)->getType() == EltTy && - static_cast<unsigned>(SameTypeIt - IncIt) < MaxNumElts) { + (*SameTypeIt)->getType() == (*IncIt)->getType()) { VisitedInstrs.insert(*SameTypeIt); ++SameTypeIt; } Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -123,12 +123,18 @@ ret <2 x i16> %ins.1 } -; FIXME: Should not vectorize define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-LABEL: @uadd_sat_v2i32( ; GCN-NEXT: bb: -; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]]) -; GCN-NEXT: ret <2 x i32> [[TMP0]] +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i32> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i32>
%arg0, i64 0 @@ -145,8 +151,15 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-LABEL: @usub_sat_v2i32( ; GCN-NEXT: bb: -; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]]) -; GCN-NEXT: ret <2 x i32> [[TMP0]] +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i32> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 @@ -163,8 +176,15 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-LABEL: @sadd_sat_v2i32( ; GCN-NEXT: bb: -; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]]) -; GCN-NEXT: ret <2 x i32> [[TMP0]] +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i32> [[INS_1]] ; bb: %arg0.0 = 
extractelement <2 x i32> %arg0, i64 0 @@ -181,8 +201,15 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-LABEL: @ssub_sat_v2i32( ; GCN-NEXT: bb: -; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]]) -; GCN-NEXT: ret <2 x i32> [[TMP0]] +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i32> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 @@ -267,8 +294,14 @@ ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <4 x i16> [[TMP0]] +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) +; GFX8-NEXT: [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> +; GFX8-NEXT: ret <4 x i16> [[INS_3]] ; bb: %arg0.0 = 
extractelement <4 x i16> %arg0, i64 0 Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll @@ -18,9 +18,9 @@ ret <2 x half> %tmp5 } -; TODO: Should probably not really be vectorizing this ; GCN-LABEL: @round_v2f32( -; GCN: call <2 x float> @llvm.round.v2f32 +; GCN: call float @llvm.round.f32( +; GCN: call float @llvm.round.f32( define <2 x float> @round_v2f32(<2 x float> %arg) { bb: %tmp = extractelement <2 x float> %arg, i64 0 Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-max-phi-size.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-max-phi-size.ll @@ -0,0 +1,130 @@ +; RUN: opt -slp-vectorizer -S -slp-max-vf=1 < %s | llc -march=amdgcn -mcpu=gfx900 | FileCheck -check-prefixes=GCN,MAX32 %s +; RUN: opt -slp-vectorizer -S -slp-max-vf=32 < %s | llc -march=amdgcn -mcpu=gfx900 | FileCheck -check-prefixes=GCN,MAX1024 %s + +; GCN-LABEL: {{^}}phi_float32: +; MAX32-NOT: Folded Spill +; GCN: flat_store_dword +define void @phi_float32(half %hval, float %fval) { +bb: + br label %bb1 + +bb1: + %i = fpext half %hval to float + %i1 = fmul float %i, %fval + %i2 = fadd float 0.000000e+00, %i1 + %i3 = fpext half %hval to float + %i4 = fmul float %i3, %fval + %i5 = fadd float 0.000000e+00, %i4 + %i6 = fpext half %hval to float + %i7 = fmul float %i6, %fval + %i8 = fadd float 0.000000e+00, %i7 + %i9 = fpext half %hval to float + %i10 = fmul float %i9, %fval + %i11 = fadd float 0.000000e+00, %i10 + %i12 = fmul float %i, %fval + %i13 = fadd float 0.000000e+00, %i12 + %i14 = fmul float %i3, %fval + %i15 = fadd float 0.000000e+00, %i14 + %i16 = fmul float %i6, %fval + %i17 = fadd float 0.000000e+00, %i16 + %i18 = fmul float %i9, %fval + %i19 = fadd float 0.000000e+00, %i18 + %i20 = fmul float %i, %fval + %i21 = fadd 
float 0.000000e+00, %i20 + %i22 = fmul float %i3, %fval + %i23 = fadd float 0.000000e+00, %i22 + %i24 = fmul float %i6, %fval + %i25 = fadd float 0.000000e+00, %i24 + %i26 = fmul float %i9, %fval + %i27 = fadd float 0.000000e+00, %i26 + %i28 = fmul float %i, %fval + %i29 = fadd float 0.000000e+00, %i28 + %i30 = fmul float %i3, %fval + %i31 = fadd float 0.000000e+00, %i30 + %i32 = fmul float %i6, %fval + %i33 = fadd float 0.000000e+00, %i32 + %i34 = fmul float %i9, %fval + %i35 = fadd float 0.000000e+00, %i34 + %i36 = fmul float %i, %fval + %i37 = fadd float 0.000000e+00, %i36 + %i38 = fmul float %i3, %fval + %i39 = fadd float 0.000000e+00, %i38 + %i40 = fmul float %i6, %fval + %i41 = fadd float 0.000000e+00, %i40 + %i42 = fmul float %i9, %fval + %i43 = fadd float 0.000000e+00, %i42 + %i44 = fmul float %i, %fval + %i45 = fadd float 0.000000e+00, %i44 + %i46 = fmul float %i3, %fval + %i47 = fadd float 0.000000e+00, %i46 + %i48 = fmul float %i6, %fval + %i49 = fadd float 0.000000e+00, %i48 + %i50 = fmul float %i9, %fval + %i51 = fadd float 0.000000e+00, %i50 + %i52 = fmul float %i, %fval + %i53 = fadd float 0.000000e+00, %i52 + %i54 = fmul float %i3, %fval + %i55 = fadd float 0.000000e+00, %i54 + %i56 = fmul float %i6, %fval + %i57 = fadd float 0.000000e+00, %i56 + %i58 = fmul float %i9, %fval + %i59 = fadd float 0.000000e+00, %i58 + %i60 = fmul float %i, %fval + %i61 = fadd float 0.000000e+00, %i60 + %i62 = fmul float %i3, %fval + %i63 = fadd float 0.000000e+00, %i62 + %i64 = fmul float %i6, %fval + %i65 = fadd float 0.000000e+00, %i64 + %i66 = fmul float %i9, %fval + %i67 = fadd float 0.000000e+00, %i66 + switch i32 undef, label %bb5 [ + i32 0, label %bb2 + i32 1, label %bb3 + i32 2, label %bb4 + ] + +bb3: + br label %bb2 + +bb4: + br label %bb2 + +bb5: + br label %bb2 + +bb2: + %phi1 = phi float [ %i19, %bb3 ], [ %i19, %bb4 ], [ %fval, %bb5 ], [ %i19, %bb1 ] + %phi2 = phi float [ %i17, %bb3 ], [ %fval, %bb4 ], [ %i17, %bb5 ], [ %i17, %bb1 ] + %phi3 = phi float [ 
%i15, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ] + %phi4 = phi float [ %i13, %bb3 ], [ %i13, %bb4 ], [ %i13, %bb5 ], [ %fval, %bb1 ] + %phi5 = phi float [ %i11, %bb3 ], [ %i11, %bb4 ], [ %fval, %bb5 ], [ %i11, %bb1 ] + %phi6 = phi float [ %i8, %bb3 ], [ %fval, %bb4 ], [ %i8, %bb5 ], [ %i8, %bb1 ] + %phi7 = phi float [ %i5, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ] + %phi8 = phi float [ %i2, %bb3 ], [ %i2, %bb4 ], [ %i2, %bb5 ], [ %fval, %bb1 ] + %phi9 = phi float [ %i21, %bb3 ], [ %i21, %bb4 ], [ %i21, %bb5 ], [ %fval, %bb1 ] + %phi10 = phi float [ %i23, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ] + %phi11 = phi float [ %i25, %bb3 ], [ %fval, %bb4 ], [ %i25, %bb5 ], [ %i25, %bb1 ] + %phi12 = phi float [ %i27, %bb3 ], [ %i27, %bb4 ], [ %fval, %bb5 ], [ %i27, %bb1 ] + %phi13 = phi float [ %i29, %bb3 ], [ %i29, %bb4 ], [ %i29, %bb5 ], [ %fval, %bb1 ] + %phi14 = phi float [ %i31, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ] + %phi15 = phi float [ %i33, %bb3 ], [ %fval, %bb4 ], [ %i33, %bb5 ], [ %i33, %bb1 ] + %phi16 = phi float [ %i35, %bb3 ], [ %i35, %bb4 ], [ %fval, %bb5 ], [ %i35, %bb1 ] + %phi17 = phi float [ %i37, %bb3 ], [ %i37, %bb4 ], [ %i37, %bb5 ], [ %fval, %bb1 ] + %phi18 = phi float [ %i39, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ] + %phi19 = phi float [ %i41, %bb3 ], [ %fval, %bb4 ], [ %i41, %bb5 ], [ %i41, %bb1 ] + %phi20 = phi float [ %i43, %bb3 ], [ %i43, %bb4 ], [ %fval, %bb5 ], [ %i43, %bb1 ] + %phi21 = phi float [ %i45, %bb3 ], [ %i45, %bb4 ], [ %i45, %bb5 ], [ %fval, %bb1 ] + %phi22 = phi float [ %i47, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ] + %phi23 = phi float [ %i49, %bb3 ], [ %fval, %bb4 ], [ %i49, %bb5 ], [ %i49, %bb1 ] + %phi24 = phi float [ %i51, %bb3 ], [ %i51, %bb4 ], [ %fval, %bb5 ], [ %i51, %bb1 ] + %phi25 = phi float [ %i53, %bb3 ], [ %i53, %bb4 ], [ %i53, %bb5 ], [ %fval, %bb1 ] + %phi26 = phi float [ %i55, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 
], [ %fval, %bb1 ] + %phi27 = phi float [ %i57, %bb3 ], [ %fval, %bb4 ], [ %i57, %bb5 ], [ %i57, %bb1 ] + %phi28 = phi float [ %i59, %bb3 ], [ %i59, %bb4 ], [ %fval, %bb5 ], [ %i59, %bb1 ] + %phi29 = phi float [ %i61, %bb3 ], [ %i61, %bb4 ], [ %i61, %bb5 ], [ %fval, %bb1 ] + %phi30 = phi float [ %i63, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ] + %phi31 = phi float [ %i65, %bb3 ], [ %fval, %bb4 ], [ %i65, %bb5 ], [ %i65, %bb1 ] + %phi32 = phi float [ %i67, %bb3 ], [ %i67, %bb4 ], [ %fval, %bb5 ], [ %i67, %bb1 ] + store float %phi31, float* undef + ret void +} Index: llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll +++ llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -S -slp-max-reg-size=32 < %s | FileCheck -check-prefix=MAX32 %s -; RUN: opt -slp-vectorizer -S -slp-max-reg-size=256 < %s | FileCheck -check-prefix=MAX256 %s -; RUN: opt -slp-vectorizer -S -slp-max-reg-size=1024 < %s | FileCheck -check-prefix=MAX1024 %s +; RUN: opt -slp-vectorizer -S -slp-max-vf=1 < %s | FileCheck -check-prefix=MAX32 %s +; RUN: opt -slp-vectorizer -S -slp-max-vf=8 < %s | FileCheck -check-prefix=MAX256 %s +; RUN: opt -slp-vectorizer -S -slp-max-vf=32 < %s | FileCheck -check-prefix=MAX1024 %s +; RUN: opt -slp-vectorizer -S < %s | FileCheck -check-prefix=MAX1024 %s define void @phi_float32(half %hval, float %fval) { ; MAX32-LABEL: @phi_float32(