Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -476,11 +476,24 @@
                                        Opd1PropInfo, Opd2PropInfo);
 }
 
+// Return true if there's a potential benefit from using v2f16 instructions for
+// an intrinsic, even if it requires nontrivial legalization.
+static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
+  switch (ID) {
+  case Intrinsic::fma: // TODO: fmuladd
+  // There's a small benefit to using vector ops in the legalized code.
+  case Intrinsic::round:
+    return true;
+  default:
+    return false;
+  }
+}
+
 template <typename T>
 int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                       ArrayRef<T *> Args, FastMathFlags FMF,
                                       unsigned VF, const Instruction *I) {
-  if (ID != Intrinsic::fma)
+  if (!intrinsicHasPackedVectorBenefit(ID))
     return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
 
   EVT OrigTy = TLI->getValueType(DL, RetTy);
@@ -502,8 +515,14 @@
   if (ST->has16BitInsts() && SLT == MVT::f16)
     NElts = (NElts + 1) / 2;
 
-  return LT.first * NElts * (ST->hasFastFMAF32() ? getHalfRateInstrCost()
-                                                 : getQuarterRateInstrCost());
+  // TODO: Get more refined intrinsic costs?
+  unsigned InstRate = getQuarterRateInstrCost();
+  if (ID == Intrinsic::fma) {
+    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
+                                   : getQuarterRateInstrCost();
+  }
+
+  return LT.first * NElts * InstRate;
 }
 
 int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
@@ -0,0 +1,38 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+
+; GCN-LABEL: @round_v2f16(
+; GFX7: call half @llvm.round.f16(
+; GFX7: call half @llvm.round.f16(
+
+; GFX8: call <2 x half> @llvm.round.v2f16(
+define <2 x half> @round_v2f16(<2 x half> %arg) {
+bb:
+  %tmp = extractelement <2 x half> %arg, i64 0
+  %tmp1 = tail call half @llvm.round.half(half %tmp)
+  %tmp2 = insertelement <2 x half> undef, half %tmp1, i64 0
+  %tmp3 = extractelement <2 x half> %arg, i64 1
+  %tmp4 = tail call half @llvm.round.half(half %tmp3)
+  %tmp5 = insertelement <2 x half> %tmp2, half %tmp4, i64 1
+  ret <2 x half> %tmp5
+}
+
+; TODO: Should probably not really be vectorizing this
+; GCN-LABEL: @round_v2f32(
+; GCN: call <2 x float> @llvm.round.v2f32
+define <2 x float> @round_v2f32(<2 x float> %arg) {
+bb:
+  %tmp = extractelement <2 x float> %arg, i64 0
+  %tmp1 = tail call float @llvm.round.f32(float %tmp)
+  %tmp2 = insertelement <2 x float> undef, float %tmp1, i64 0
+  %tmp3 = extractelement <2 x float> %arg, i64 1
+  %tmp4 = tail call float @llvm.round.f32(float %tmp3)
+  %tmp5 = insertelement <2 x float> %tmp2, float %tmp4, i64 1
+  ret <2 x float> %tmp5
+}
+
+declare half @llvm.round.half(half) #0
+declare float @llvm.round.f32(float) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }