Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -935,6 +935,11 @@ /// applies when shouldMaximizeVectorBandwidth returns true. unsigned getMinimumVF(unsigned ElemWidth) const; + /// \return The maximum vectorization factor for types of given element + /// bit width and opcode, or 0 if there is no maximum VF. + /// Currently only used by the SLP vectorizer. + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; + /// \return True if it should be considered for address type promotion. /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is /// profitable without finding other extensions fed by the same input. @@ -1492,6 +1497,7 @@ virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0; + virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0; virtual bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0; virtual unsigned getCacheLineSize() const = 0; @@ -1911,6 +1917,9 @@ unsigned getMinimumVF(unsigned ElemWidth) const override { return Impl.getMinimumVF(ElemWidth); } + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override { + return Impl.getMaximumVF(ElemWidth, Opcode); + } bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override { return Impl.shouldConsiderAddressTypePromotion( Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -356,6 +356,8 @@ unsigned getMinimumVF(unsigned ElemWidth) const { return 0; } + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { return 0; } + bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -635,6 +635,11 @@ return TTIImpl->getMinimumVF(ElemWidth); } +unsigned TargetTransformInfo::getMaximumVF(unsigned ElemWidth, + unsigned Opcode) const { + return TTIImpl->getMaximumVF(ElemWidth, Opcode); +} + bool TargetTransformInfo::shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { return TTIImpl->shouldConsiderAddressTypePromotion( Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -170,6 +170,7 @@ unsigned getNumberOfRegisters(unsigned RCID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getMinVectorRegisterBitWidth() const; + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const; Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -288,6 +288,12 @@ return 32; } +unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { + if (Opcode == Instruction::Load || Opcode == Instruction::Store) + return 32 * 4 / ElemWidth; + return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1; +} + unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const { Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -126,6 +126,10 @@ MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +static cl::opt +MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, + cl::desc("Maximum SLP vectorization factor")); + static cl::opt MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, cl::desc("Maximum depth of the lookup for consecutive stores.")); @@ -741,6 +745,12 @@ return MinVecRegSize; } + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { + unsigned MaxVF = MaxVFOption.getNumOccurrences() ? + MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); + return MaxVF ? MaxVF : UINT_MAX; + } + /// Check if homogeneous aggregate is isomorphic to some VectorType. /// Accepts homogeneous multidimensional aggregate of scalars/vectors like /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, @@ -6180,6 +6190,7 @@ unsigned Sz = R.getVectorElementSize(I0); unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); unsigned MaxVF = std::max(PowerOf2Floor(VL.size()), MinVF); + MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) @@ -7623,7 +7634,6 @@ bool Changed = false; SmallVector Incoming; SmallPtrSet VisitedInstrs; - unsigned MaxVecRegSize = R.getMaxVecRegSize(); bool HaveVectorizedPhiNodes = true; while (HaveVectorizedPhiNodes) { @@ -7650,27 +7660,8 @@ // Look for the next elements with the same type. SmallVector::iterator SameTypeIt = IncIt; - Type *EltTy = (*IncIt)->getType(); - - assert(EltTy->isSized() && - "Instructions should all be sized at this point"); - TypeSize EltTS = DL->getTypeSizeInBits(EltTy); - if (EltTS.isScalable()) { - // For now, just ignore vectorizing scalable types. - ++IncIt; - continue; - } - - unsigned EltSize = EltTS.getFixedSize(); - unsigned MaxNumElts = MaxVecRegSize / EltSize; - if (MaxNumElts < 2) { - ++IncIt; - continue; - } - while (SameTypeIt != E && - (*SameTypeIt)->getType() == EltTy && - static_cast(SameTypeIt - IncIt) < MaxNumElts) { + (*SameTypeIt)->getType() == (*IncIt)->getType()) { VisitedInstrs.insert(*SameTypeIt); ++SameTypeIt; } Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -123,12 +123,18 @@ ret <2 x i16> %ins.1 } -; FIXME: Should not vectorize define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-LABEL: @uadd_sat_v2i32( ; GCN-NEXT: bb: -; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]]) -; GCN-NEXT: ret <2 x i32> [[TMP0]] +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i32> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 @@ -145,8 +151,15 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-LABEL: @usub_sat_v2i32( ; GCN-NEXT: bb: -; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]]) -; GCN-NEXT: ret <2 x i32> [[TMP0]] +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i32> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 @@ -163,8 +176,15 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-LABEL: @sadd_sat_v2i32( ; GCN-NEXT: bb: -; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]]) -; GCN-NEXT: ret <2 x i32> [[TMP0]] +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i32> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 @@ -181,8 +201,15 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) { ; GCN-LABEL: @ssub_sat_v2i32( ; GCN-NEXT: bb: -; GCN-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]]) -; GCN-NEXT: ret <2 x i32> [[TMP0]] +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i32> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i32> %arg0, i64 0 @@ -267,8 +294,14 @@ ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <4 x i16> [[TMP0]] +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> +; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) +; GFX8-NEXT: [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> +; GFX8-NEXT: ret <4 x i16> [[INS_3]] ; bb: %arg0.0 = extractelement <4 x i16> %arg0, i64 0 Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll @@ -18,9 +18,9 @@ ret <2 x half> %tmp5 } -; TODO: Should probably not really be vectorizing this ; GCN-LABEL: @round_v2f32( -; GCN: call <2 x float> @llvm.round.v2f32 +; GCN: call float @llvm.round.f32( +; GCN: call float @llvm.round.f32( define <2 x float> @round_v2f32(<2 x float> %arg) { bb: %tmp = extractelement <2 x float> %arg, i64 0 Index: llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll +++ llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll @@ -1,7 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -S -slp-max-reg-size=32 < %s | FileCheck -check-prefix=MAX32 %s -; RUN: opt -slp-vectorizer -S -slp-max-reg-size=256 < %s | FileCheck -check-prefix=MAX256 %s -; RUN: opt -slp-vectorizer -S -slp-max-reg-size=1024 < %s | FileCheck -check-prefix=MAX1024 %s +; RUN: opt -slp-vectorizer -S -slp-max-vf=1 < %s | FileCheck -check-prefix=MAX32 %s +; RUN: opt -slp-vectorizer -S -slp-max-vf=8 < %s | FileCheck -check-prefix=MAX256 %s +; RUN: opt -slp-vectorizer -S -slp-max-vf=32 < %s | FileCheck -check-prefix=MAX1024 %s +; RUN: opt -slp-vectorizer -S < %s | FileCheck -check-prefix=MAX1024 %s + +; Make sure we do not vectorize to create PHI wider than requested. +; On AMDGPU target wider vectorization will result in a higher register pressure, +; spilling, or even inability to allocate registers. define void @phi_float32(half %hval, float %fval) { ; MAX32-LABEL: @phi_float32( @@ -120,6 +125,7 @@ ; MAX32-NEXT: [[PHI30:%.*]] = phi float [ [[I63]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[FVAL]], [[BB1]] ] ; MAX32-NEXT: [[PHI31:%.*]] = phi float [ [[I65]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[I65]], [[BB5]] ], [ [[I65]], [[BB1]] ] ; MAX32-NEXT: [[PHI32:%.*]] = phi float [ [[I67]], [[BB3]] ], [ [[I67]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[I67]], [[BB1]] ] +; MAX32-NEXT: store float [[PHI31]], float* undef, align 4 ; MAX32-NEXT: ret void ; ; MAX256-LABEL: @phi_float32( @@ -296,6 +302,8 @@ ; MAX256-NEXT: [[TMP154:%.*]] = phi <8 x float> [ [[TMP28]], [[BB3]] ], [ [[TMP93]], [[BB4]] ], [ [[TMP132]], [[BB5]] ], [ [[TMP54]], [[BB1]] ] ; MAX256-NEXT: [[TMP155:%.*]] = phi <8 x float> [ [[TMP30]], [[BB3]] ], [ [[TMP103]], [[BB4]] ], [ [[TMP142]], [[BB5]] ], [ [[TMP64]], [[BB1]] ] ; MAX256-NEXT: [[TMP156:%.*]] = phi <8 x float> [ [[TMP32]], [[BB3]] ], [ [[TMP113]], [[BB4]] ], [ [[TMP152]], [[BB5]] ], [ [[TMP74]], [[BB1]] ] +; MAX256-NEXT: [[TMP157:%.*]] = extractelement <8 x float> [[TMP156]], i32 6 +; MAX256-NEXT: store float [[TMP157]], float* undef, align 4 ; MAX256-NEXT: ret void ; ; MAX1024-LABEL: @phi_float32( @@ -481,6 +489,8 @@ ; MAX1024-NEXT: br label [[BB2]] ; MAX1024: bb2: ; MAX1024-NEXT: [[TMP165:%.*]] = phi <32 x float> [ [[TMP38]], [[BB3]] ], [ [[TMP125]], [[BB4]] ], [ [[TMP164]], [[BB5]] ], [ [[TMP86]], [[BB1]] ] +; MAX1024-NEXT: [[TMP166:%.*]] = extractelement <32 x float> [[TMP165]], i32 30 +; MAX1024-NEXT: store float [[TMP166]], float* undef, align 4 ; MAX1024-NEXT: ret void ; bb: @@ -603,5 +613,6 @@ %phi30 = phi float [ %i63, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ] %phi31 = phi float [ %i65, %bb3 ], [ %fval, %bb4 ], [ %i65, %bb5 ], [ %i65, %bb1 ] %phi32 = phi float [ %i67, %bb3 ], [ %i67, %bb4 ], [ %fval, %bb5 ], [ %i67, %bb1 ] + store float %phi31, float* undef ret void }