diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -115,21 +115,26 @@ return TargetTransformInfo::TCC_Basic; } - static inline int getHalfRateInstrCost() { - return 2 * TargetTransformInfo::TCC_Basic; + static inline int getHalfRateInstrCost( + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) { + return CostKind == TTI::TCK_CodeSize ? 2 + : 2 * TargetTransformInfo::TCC_Basic; } // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe // should be 2 or 4. - static inline int getQuarterRateInstrCost() { - return 3 * TargetTransformInfo::TCC_Basic; + static inline int getQuarterRateInstrCost( + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) { + return CostKind == TTI::TCK_CodeSize ? 2 + : 4 * TargetTransformInfo::TCC_Basic; } - // On some parts, normal fp64 operations are half rate, and others - // quarter. This also applies to some integer operations. - inline int get64BitInstrCost() const { - return ST->hasHalfRate64Ops() ? - getHalfRateInstrCost() : getQuarterRateInstrCost(); + // On some parts, normal fp64 operations are half rate, and others + // quarter. This also applies to some integer operations. + inline int get64BitInstrCost( + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const { + return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind) + : getQuarterRateInstrCost(CostKind); } public: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -472,9 +472,50 @@ // FIXME: We're having to query the throughput cost so that the basic // implementation tries to generate legalize and scalarization costs. Maybe // we could hoist the scalarization code here? - return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput, - Opd1Info, Opd2Info, Opd1PropInfo, - Opd2PropInfo, Args, CxtI); + if (CostKind != TTI::TCK_CodeSize) + return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput, - Opd1Info, Opd2Info, Opd1PropInfo, + Opd1Info, Opd2Info, Opd1PropInfo, + Opd2PropInfo, Args, CxtI); + // Scalarization + + // Check if any of the operands are vector operands. + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + + bool IsFloat = Ty->isFPOrFPVectorTy(); + // Assume that floating point arithmetic operations cost twice as much as + // integer operations. + unsigned OpCost = (IsFloat ? 2 : 1); + + if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { + // The operation is legal. Assume it costs 1. + // TODO: Once we have extract/insert subvector cost we need to use them. + return LT.first * OpCost; + } + + if (!TLI->isOperationExpand(ISD, LT.second)) { + // If the operation is custom lowered, then assume that the code is twice + // as expensive. + return LT.first * 2 * OpCost; + } + + // Else, assume that we need to scalarize this op. + // TODO: If one of the types get legalized by splitting, handle this + // similarly to what getCastInstrCost() does.
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) { + unsigned Num = cast<FixedVectorType>(VTy)->getNumElements(); + unsigned Cost = getArithmeticInstrCost( + Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo, Args, CxtI); + // Return the cost of multiple scalar invocation plus the cost of + // inserting and extracting the values. + return getScalarizationOverhead(VTy, Args) + Num * Cost; + } + + // We don't know anything about this scalar instruction. + return OpCost; } // Legalize the type. @@ -493,7 +534,7 @@ case ISD::SRL: case ISD::SRA: if (SLT == MVT::i64) - return get64BitInstrCost() * LT.first * NElts; + return get64BitInstrCost(CostKind) * LT.first * NElts; if (ST->has16BitInsts() && SLT == MVT::i16) NElts = (NElts + 1) / 2; @@ -515,7 +556,7 @@ return LT.first * NElts * getFullRateInstrCost(); case ISD::MUL: { - const int QuarterRateCost = getQuarterRateInstrCost(); + const int QuarterRateCost = getQuarterRateInstrCost(CostKind); if (SLT == MVT::i64) { const int FullRateCost = getFullRateInstrCost(); return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts; } @@ -552,7 +593,7 @@ case ISD::FADD: case ISD::FSUB: if (SLT == MVT::f64) - return LT.first * NElts * get64BitInstrCost(); + return LT.first * NElts * get64BitInstrCost(CostKind); if (ST->has16BitInsts() && SLT == MVT::f16) NElts = (NElts + 1) / 2; @@ -565,7 +606,9 @@ // FIXME: frem should be handled separately. The fdiv in it is most of it, // but the current lowering is also not entirely correct. if (SLT == MVT::f64) { - int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost(); + int Cost = 7 * get64BitInstrCost(CostKind) + + getQuarterRateInstrCost(CostKind) + + 3 * getHalfRateInstrCost(CostKind); // Add cost of workaround. if (!ST->hasUsableDivScaleConditionOutput()) Cost += 3 * getFullRateInstrCost(); @@ -577,7 +620,7 @@ // TODO: This is more complicated, unsafe flags etc. if ((SLT == MVT::f32 && !HasFP32Denormals) || (SLT == MVT::f16 && ST->has16BitInsts())) { - return LT.first * getQuarterRateInstrCost() * NElts; + return LT.first * getQuarterRateInstrCost(CostKind) * NElts; } } @@ -587,12 +630,15 @@ // f32 fmul // v_cvt_f16_f32 // f16 div_fixup - int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(); + int Cost = + 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind); return LT.first * Cost * NElts; } if (SLT == MVT::f32 || SLT == MVT::f16) { - int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost(); + // 4 more v_cvt_* insts without f16 insts support + int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() + + 1 * getQuarterRateInstrCost(CostKind); if (!HasFP32Denormals) { // FP mode switches. @@ -642,7 +688,48 @@ Type *RetTy = ICA.getReturnType(); EVT OrigTy = TLI->getValueType(DL, RetTy); if (!OrigTy.isSimple()) { - return BaseT::getIntrinsicInstrCost(ICA, CostKind); + if (CostKind != TTI::TCK_CodeSize) + return BaseT::getIntrinsicInstrCost(ICA, CostKind); + + // TODO: Combine these two logic paths. + if (ICA.isTypeBasedOnly()) + return getTypeBasedIntrinsicInstrCost(ICA, CostKind); + + Type *RetTy = ICA.getReturnType(); + unsigned VF = ICA.getVectorFactor(); + unsigned RetVF = + (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements() + : 1); + assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type"); + const IntrinsicInst *I = ICA.getInst(); + const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); + FastMathFlags FMF = ICA.getFlags(); + // Assume that we need to scalarize this intrinsic.
+ SmallVector<Type *, 4> Types; + for (const Value *Op : Args) { + Type *OpTy = Op->getType(); + assert(VF == 1 || !OpTy->isVectorTy()); + Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF)); + } + + if (VF > 1 && !RetTy->isVoidTy()) + RetTy = FixedVectorType::get(RetTy, VF); + + // Compute the scalarization overhead based on Args for a vector + // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while + // CostModel will pass a vector RetTy and VF is 1. + unsigned ScalarizationCost = std::numeric_limits<unsigned>::max(); + if (RetVF > 1 || VF > 1) { + ScalarizationCost = 0; + if (!RetTy->isVoidTy()) + ScalarizationCost += + getScalarizationOverhead(cast<VectorType>(RetTy), true, false); + ScalarizationCost += getOperandsScalarizationOverhead(Args, VF); + } + + IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF, + ScalarizationCost, I); + return getIntrinsicInstrCost(Attrs, CostKind); } // Legalize the type. @@ -654,16 +741,16 @@ MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; if (SLT == MVT::f64) - return LT.first * NElts * get64BitInstrCost(); + return LT.first * NElts * get64BitInstrCost(CostKind); if (ST->has16BitInsts() && SLT == MVT::f16) NElts = (NElts + 1) / 2; // TODO: Get more refined intrinsic costs? - unsigned InstRate = getQuarterRateInstrCost(); + unsigned InstRate = getQuarterRateInstrCost(CostKind); if (ICA.getID() == Intrinsic::fma) { - InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost() - : getQuarterRateInstrCost(); + InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind) + : getQuarterRateInstrCost(CostKind); } return LT.first * NElts * InstRate; @@ -714,7 +801,7 @@ CostKind); std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); - return LT.first * getHalfRateInstrCost(); + return LT.first * getHalfRateInstrCost(CostKind); } int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -1,9 +1,9 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s -; ALL: 'fadd_f32' +; ALL-LABEL: 'fadd_f32' ; ALL: estimated cost of 1 for {{.*}} fadd float define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { %vec = load float, float addrspace(1)* %vaddr @@ -12,7 +12,7 @@ ret void } -; ALL: 'fadd_v2f32' +; ALL-LABEL: 'fadd_v2f32' ; ALL: estimated cost of 2 for {{.*}} fadd <2 x float> define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)*
%vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr @@ -21,10 +21,8 @@ ret void } -; ALL: 'fadd_v3f32' -; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. -; ALL: estimated cost of {{[34]}} for {{.*}} fadd <3 x float> +; ALL-LABEL: 'fadd_v3f32' +; ALL: estimated cost of 3 for {{.*}} fadd <3 x float> define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fadd <3 x float> %vec, %b @@ -32,10 +30,8 @@ ret void } -; ALL: 'fadd_v5f32' -; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. -; ALL: estimated cost of {{[58]}} for {{.*}} fadd <5 x float> +; ALL-LABEL: 'fadd_v5f32' +; ALL: estimated cost of 5 for {{.*}} fadd <5 x float> define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fadd <5 x float> %vec, %b @@ -43,9 +39,10 @@ ret void } -; ALL: 'fadd_f64' +; ALL-LABEL: 'fadd_f64' ; FASTF64: estimated cost of 2 for {{.*}} fadd double -; SLOWF64: estimated cost of 3 for {{.*}} fadd double +; SLOWF64: estimated cost of 4 for {{.*}} fadd double +; SIZEALL: estimated cost of 2 for {{.*}} fadd double define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fadd double %vec, %b @@ -53,9 +50,10 @@ ret void } -; ALL: 'fadd_v2f64' +; ALL-LABEL: 'fadd_v2f64' ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double> -; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double> +; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double> +; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double> define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fadd <2 x double> %vec, %b @@ -63,9 +61,10 @@ ret void } -; ALL: 'fadd_v3f64' +; ALL-LABEL: 'fadd_v3f64' ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double> -; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double> +; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double> +; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double> define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fadd <3 x double> %vec, %b @@ -73,7 +72,7 @@ ret void } -; ALL: 'fadd_f16' +; ALL-LABEL: 'fadd_f16' ; ALL: estimated cost of 1 for {{.*}} fadd half define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { %vec = load half, half addrspace(1)* %vaddr @@ -82,7 +81,7 @@ ret void } -; ALL: 'fadd_v2f16' +; ALL-LABEL: 'fadd_v2f16' ; SLOWF16: estimated cost of 2 for {{.*}} fadd <2 x half> ; FASTF16: estimated cost of 1 for {{.*}} fadd <2 x half> define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { @@ -92,7 +91,7 @@ ret void } -; ALL: 'fadd_v3f16' +; ALL-LABEL: 'fadd_v3f16' ; SLOWF16: estimated cost of 4 for {{.*}} fadd <3 x half> ; FASTF16: estimated cost of 2 for {{.*}} fadd <3 x half> define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> 
addrspace(1)* %vaddr, <3 x half> %b) #0 { @@ -102,7 +101,7 @@ ret void } -; ALL: 'fadd_v4f16' +; ALL-LABEL: 'fadd_v4f16' ; SLOWF16: estimated cost of 4 for {{.*}} fadd <4 x half> ; FASTF16: estimated cost of 2 for {{.*}} fadd <4 x half> define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -1,19 +1,18 @@ -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,NOFP16,NOFP16-FP32DENORM,SLOWFP32DENORMS %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FASTFP32DENORMS,FP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,THRPTALL,CIFASTF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,THRPTALL,CISLOWF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,THRPTALL,SIFASTF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,THRPTALL,SISLOWF64,NOFP16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,THRPTALL,FP16,CISLOWF64 %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP16,NOFP16-NOFP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FASTFP32DENORMS,FP16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze 
-mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZEF16 %s -; ALL: 'fdiv_f32_ieee' -; ALL: estimated cost of 10 for {{.*}} fdiv float +; ALL-LABEL: 'fdiv_f32_ieee' +; THRPTALL: estimated cost of 14 for {{.*}} fdiv float +; SIZEALL: estimated cost of 12 for {{.*}} fdiv float define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float %vec, %b @@ -21,8 +20,9 @@ ret void } -; ALL: 'fdiv_f32_ftzdaz' -; ALL: estimated cost of 12 for {{.*}} fdiv float +; ALL-LABEL: 'fdiv_f32_ftzdaz' +; THRPTALL: estimated cost of 16 for {{.*}} fdiv float +; SIZEALL: estimated cost of 14 for {{.*}} fdiv float define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #1 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float %vec, %b @@ -30,8 +30,9 @@ ret void } -; ALL: 'fdiv_v2f32_ieee' -; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float> +; ALL-LABEL: 'fdiv_v2f32_ieee' +; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float> define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> %vec, %b @@ -39,8 +40,9 @@ ret void } -; ALL: 'fdiv_v2f32_ftzdaz' -; ALL: estimated cost of 24 for {{.*}} fdiv <2 x float> +; ALL-LABEL: 'fdiv_v2f32_ftzdaz' +; THRPTALL: estimated cost of 32 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 28 for {{.*}} fdiv <2 x float> define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #1 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> %vec, %b @@ -48,10 +50,9 @@ ret void } -; ALL: 'fdiv_v3f32_ieee' -; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 36/30 when it is legal. -; ALL: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float> +; ALL-LABEL: 'fdiv_v3f32_ieee' +; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float> +; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float> define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fdiv <3 x float> %vec, %b @@ -59,10 +60,9 @@ ret void } -; ALL: 'fdiv_v3f32_ftzdaz' -; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 36/30 when it is legal. 
-; ALL: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float> +; ALL-LABEL: 'fdiv_v3f32_ftzdaz' +; THRPTALL: estimated cost of 48 for {{.*}} fdiv <3 x float> +; SIZEALL: estimated cost of 42 for {{.*}} fdiv <3 x float> define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #1 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fdiv <3 x float> %vec, %b @@ -70,10 +70,9 @@ ret void } -; ALL: 'fdiv_v5f32_ieee' -; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 60/50 when it is legal. -; ALL: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float> +; ALL-LABEL: 'fdiv_v5f32_ieee' +; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float> +; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float> define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fdiv <5 x float> %vec, %b @@ -81,10 +80,9 @@ ret void } -; ALL: 'fdiv_v5f32_ftzdaz' -; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 60/50 when it is legal. -; ALL: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float> +; ALL-LABEL: 'fdiv_v5f32_ftzdaz' +; THRPTALL: estimated cost of 80 for {{.*}} fdiv <5 x float> +; SIZEALL: estimated cost of 70 for {{.*}} fdiv <5 x float> define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #1 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fdiv <5 x float> %vec, %b @@ -92,11 +90,13 @@ ret void } -; ALL: 'fdiv_f64' -; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double -; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double -; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double -; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double +; ALL-LABEL: 'fdiv_f64' +; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double +; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double +; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double +; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double +; SIZECI: estimated cost of 22 for {{.*}} fdiv double +; SIZESI: estimated cost of 25 for {{.*}} fdiv double define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fdiv double %vec, %b @@ -104,11 +104,13 @@ ret void } -; ALL: 'fdiv_v2f64' -; CIFASTF64: estimated cost of 58 for {{.*}} fdiv <2 x double> -; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double> -; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double> -; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double> +; ALL-LABEL: 'fdiv_v2f64' +; CIFASTF64: estimated cost of 48 for {{.*}} fdiv <2 x double> +; CISLOWF64: estimated cost of 76 for {{.*}} fdiv <2 x double> +; SIFASTF64: estimated cost of 54 for {{.*}} fdiv <2 x double> +; SISLOWF64: estimated cost of 82 for {{.*}} fdiv <2 x double> +; SIZECI: estimated cost of 44 for {{.*}} fdiv <2 x double> +; SIZESI: estimated cost of 50 for {{.*}} fdiv <2 x double> define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fdiv <2 x double> %vec, %b @@ -116,11 +118,13 @@ ret void } -; ALL: 'fdiv_v3f64' -; CIFASTF64: estimated cost of 87 for {{.*}} fdiv <3 
x double> -; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double> -; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double> -; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double> +; ALL-LABEL: 'fdiv_v3f64' +; CIFASTF64: estimated cost of 72 for {{.*}} fdiv <3 x double> +; CISLOWF64: estimated cost of 114 for {{.*}} fdiv <3 x double> +; SIFASTF64: estimated cost of 81 for {{.*}} fdiv <3 x double> +; SISLOWF64: estimated cost of 123 for {{.*}} fdiv <3 x double> +; SIZECI: estimated cost of 66 for {{.*}} fdiv <3 x double> +; SIZESI: estimated cost of 75 for {{.*}} fdiv <3 x double> define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fdiv <3 x double> %vec, %b @@ -128,9 +132,11 @@ ret void } -; ALL: 'fdiv_f16_f32_ieee' -; NOFP16: estimated cost of 10 for {{.*}} fdiv half -; FP16: estimated cost of 10 for {{.*}} fdiv half +; ALL-LABEL: 'fdiv_f16_f32_ieee' +; NOFP16: estimated cost of 14 for {{.*}} fdiv half +; FP16: estimated cost of 12 for {{.*}} fdiv half +; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half +; SIZEF16: estimated cost of 8 for {{.*}} fdiv half define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half %vec, %b @@ -138,9 +144,11 @@ ret void } -; ALL: 'fdiv_f16_f32_ftzdaz' -; NOFP16: estimated cost of 12 for {{.*}} fdiv half -; FP16: estimated cost of 10 for {{.*}} fdiv half +; ALL-LABEL: 'fdiv_f16_f32_ftzdaz' +; NOFP16: estimated cost of 16 for {{.*}} fdiv half +; FP16: estimated cost of 12 for {{.*}} fdiv half +; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half +; SIZEF16: estimated cost of 8 for {{.*}} fdiv half define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #1 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half %vec, %b @@ -148,9 +156,11 @@ ret void } -; ALL: 'fdiv_v2f16_f32_ieee' -; NOFP16: estimated cost of 20 for {{.*}} fdiv <2 x half> -; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half> +; ALL-LABEL: 'fdiv_v2f16_f32_ieee' +; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half> +; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half> define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> %vec, %b @@ -158,9 +168,11 @@ ret void } -; ALL: 'fdiv_v2f16_f32_ftzdaz' -; NOFP16: estimated cost of 24 for {{.*}} fdiv <2 x half> -; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half> +; ALL-LABEL: 'fdiv_v2f16_f32_ftzdaz' +; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half> +; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half> +; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half> define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #1 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> %vec, %b @@ -168,9 +180,11 @@ ret void } -; ALL: 'fdiv_v4f16_f32_ieee' -; NOFP16: estimated cost of 40 for {{.*}} fdiv <4 x half> -; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half> +; 
ALL-LABEL: 'fdiv_v4f16_f32_ieee' +; NOFP16: estimated cost of 56 for {{.*}} fdiv <4 x half> +; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half> +; SIZENOF16: estimated cost of 48 for {{.*}} fdiv <4 x half> +; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half> define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr %add = fdiv <4 x half> %vec, %b @@ -178,9 +192,11 @@ ret void } -; ALL: 'fdiv_v4f16_f32_ftzdaz' -; NOFP16: estimated cost of 48 for {{.*}} fdiv <4 x half> -; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half> +; ALL-LABEL: 'fdiv_v4f16_f32_ftzdaz' +; NOFP16: estimated cost of 64 for {{.*}} fdiv <4 x half> +; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half> +; SIZENOF16: estimated cost of 56 for {{.*}} fdiv <4 x half> +; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half> define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #1 { %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr %add = fdiv <4 x half> %vec, %b @@ -188,9 +204,9 @@ ret void } -; ALL: 'rcp_f32_ieee' -; SLOWFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float -; FASTFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float +; ALL-LABEL: 'rcp_f32_ieee' +; THRPTALL: estimated cost of 14 for {{.*}} fdiv float +; SIZEALL: estimated cost of 12 for {{.*}} fdiv float define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float 1.0, %vec @@ -198,8 +214,9 @@ ret void } -; ALL: 'rcp_f32_ftzdaz' -; ALL: estimated cost of 3 for {{.*}} fdiv float +; ALL-LABEL: 'rcp_f32_ftzdaz' +; THRPTALL: estimated cost of 4 for {{.*}} fdiv float +; SIZEALL: estimated cost of 2 for {{.*}} fdiv float define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr) #1 { %vec = load float, float addrspace(1)* %vaddr %add = fdiv float 1.0, %vec @@ -207,9 +224,11 @@ ret void } -; ALL: 'rcp_f16_f32_ieee' -; NOFP16: estimated cost of 10 for {{.*}} fdiv half -; FP16: estimated cost of 3 for {{.*}} fdiv half +; ALL-LABEL: 'rcp_f16_f32_ieee' +; NOFP16: estimated cost of 14 for {{.*}} fdiv half +; FP16: estimated cost of 4 for {{.*}} fdiv half +; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half +; SIZEF16: estimated cost of 2 for {{.*}} fdiv half define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half 1.0, %vec @@ -217,9 +236,9 @@ ret void } -; ALL: 'rcp_f16_f32_ftzdaz' -; NOFP16: estimated cost of 3 for {{.*}} fdiv half -; FP16: estimated cost of 3 for {{.*}} fdiv half +; ALL-LABEL: 'rcp_f16_f32_ftzdaz' +; THRPTALL: estimated cost of 4 for {{.*}} fdiv half +; SIZEALL: estimated cost of 2 for {{.*}} fdiv half define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr) #1 { %vec = load half, half addrspace(1)* %vaddr %add = fdiv half 1.0, %vec @@ -227,11 +246,13 @@ ret void } -; ALL: 'rcp_f64' -; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double -; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double -; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double -; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double +; ALL-LABEL: 'rcp_f64' +; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double +; CISLOWF64: estimated cost of 38 for {{.*}} fdiv 
double +; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double +; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double +; SIZECI: estimated cost of 22 for {{.*}} fdiv double +; SIZESI: estimated cost of 25 for {{.*}} fdiv double define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fdiv double 1.0, %vec @@ -239,9 +260,9 @@ ret void } -; ALL: 'rcp_v2f32_ieee' -; SLOWFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float> -; FASTFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float> +; ALL-LABEL: 'rcp_v2f32_ieee' +; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float> define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec @@ -249,8 +270,9 @@ ret void } -; ALL: 'rcp_v2f32_ftzdaz' -; ALL: estimated cost of 6 for {{.*}} fdiv <2 x float> +; ALL-LABEL: 'rcp_v2f32_ftzdaz' +; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float> define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #1 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec @@ -258,9 +280,11 @@ ret void } -; ALL: 'rcp_v2f16_f32_ieee' -; NOFP16: estimated cost of 20 for {{.*}} fdiv <2 x half> -; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half> +; ALL-LABEL: 'rcp_v2f16_f32_ieee' +; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half> +; FP16: estimated cost of 8 for {{.*}} fdiv <2 x half> +; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZEF16: estimated cost of 4 for {{.*}} fdiv <2 x half> define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec @@ -268,9 +292,9 @@ ret void } -; ALL: 'rcp_v2f16_f32_ftzdaz' -; NOFP16: estimated cost of 6 for {{.*}} fdiv <2 x half> -; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half> +; ALL-LABEL: 'rcp_v2f16_f32_ftzdaz' +; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x half> +; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x half> define amdgpu_kernel void @rcp_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #1 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -1,11 +1,12 @@ -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FAST32,FASTF16,ALL %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOW32,SLOWF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FAST32,FASTF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOW32,SLOWF16,ALL %s +; RUN: opt -cost-model -analyze
-mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF32,FASTF16,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF32,SLOWF16,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZEF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZENOF16 %s ; ALL-LABEL: 'fma_f32' -; SLOW32: estimated cost of 3 for {{.*}} call float @llvm.fma.f32 -; FAST32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32 +; SLOWF32: estimated cost of 4 for {{.*}} call float @llvm.fma.f32 +; FASTF32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32 +; SIZEALL: estimated cost of 2 for {{.*}} call float @llvm.fma.f32 define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 { %vec = load float, float addrspace(1)* %vaddr %fma = call float @llvm.fma.f32(float %vec, float %vec, float %vec) #1 @@ -14,8 +15,9 @@ } ; ALL-LABEL: 'fma_v2f32' -; SLOW32: estimated cost of 6 for {{.*}} call <2 x float> @llvm.fma.v2f32 -; FAST32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32 +; SLOWF32: estimated cost of 8 for {{.*}} call <2 x float> @llvm.fma.v2f32 +; FASTF32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32 +; SIZEALL: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32 define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %vec, <2 x float> %vec, <2 x float> %vec) #1 @@ -24,8 +26,9 @@ } ; ALL-LABEL: 'fma_v3f32' -; SLOW32: estimated cost of 9 for {{.*}} call <3 x float> @llvm.fma.v3f32 -; FAST32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 +; SLOWF32: estimated cost of 12 for {{.*}} call <3 x float> @llvm.fma.v3f32 +; FASTF32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 +; SIZEALL: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %fma = call <3 x float> @llvm.fma.v3f32(<3 x float> %vec, <3 x float> %vec, <3 x float> %vec) #1 @@ -34,8 +37,9 @@ } ; ALL-LABEL: 'fma_v5f32' -; SLOW32: estimated cost of 15 for {{.*}} call <5 x float> @llvm.fma.v5f32 -; FAST32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 +; SLOWF32: estimated cost of 20 for {{.*}} call <5 x float> @llvm.fma.v5f32 +; FASTF32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 +; SIZEALL: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %fma = call <5 x float> @llvm.fma.v5f32(<5 x float> %vec, <5 x float> %vec, <5 x float> %vec) #1 @@ -44,8 +48,9 @@ } ; ALL-LABEL: 'fma_f64' -; SLOW64: estimated cost of 3 for {{.*}} call double @llvm.fma.f64 -; FAST64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 +; SLOWF64: estimated cost of 4 for {{.*}} call double @llvm.fma.f64 +; FASTF64: estimated cost of 2 for 
{{.*}} call double @llvm.fma.f64 +; SIZEALL: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 { %vec = load double, double addrspace(1)* %vaddr %fma = call double @llvm.fma.f64(double %vec, double %vec, double %vec) #1 @@ -54,8 +59,9 @@ } ; ALL-LABEL: 'fma_v2f64' -; SLOW64: estimated cost of 6 for {{.*}} call <2 x double> @llvm.fma.v2f64 -; FAST64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 +; SLOWF64: estimated cost of 8 for {{.*}} call <2 x double> @llvm.fma.v2f64 +; FASTF64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 +; SIZEALL: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %vec, <2 x double> %vec, <2 x double> %vec) #1 @@ -64,8 +70,9 @@ } ; ALL-LABEL: 'fma_v3f64' -; SLOW64: estimated cost of 9 for {{.*}} call <3 x double> @llvm.fma.v3f64 -; FAST64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64 +; SLOWF64: estimated cost of 12 for {{.*}} call <3 x double> @llvm.fma.v3f64 +; FASTF64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64 +; SIZEALL: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64 define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %fma = call <3 x double> @llvm.fma.v3f64(<3 x double> %vec, <3 x double> %vec, <3 x double> %vec) #1 @@ -74,8 +81,9 @@ } ; ALL-LABEL: 'fma_f16' -; SLOW16: estimated cost of 3 for {{.*}} call half @llvm.fma.f16 -; FAST16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 +; SLOWF16: estimated cost of 4 for {{.*}} call half @llvm.fma.f16 +; FASTF16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 +; SIZEALL: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 { %vec = load half, half addrspace(1)* %vaddr %fma = call half @llvm.fma.f16(half %vec, half %vec, half %vec) #1 @@ -84,8 +92,10 @@ } ; ALL-LABEL: 'fma_v2f16' -; SLOW16: estimated cost of 6 for {{.*}} call <2 x half> @llvm.fma.v2f16 -; FAST16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; SLOWF16: estimated cost of 8 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; FASTF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; SIZEF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 +; SIZENOF16: estimated cost of 4 for {{.*}} call <2 x half> @llvm.fma.v2f16 define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 { %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %vec, <2 x half> %vec, <2 x half> %vec) #1 @@ -94,8 +104,10 @@ } ; ALL-LABEL: 'fma_v3f16' -; SLOW16: estimated cost of 12 for {{.*}} call <3 x half> @llvm.fma.v3f16 -; FAST16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; SLOWF16: estimated cost of 16 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; FASTF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; SIZEF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 +; SIZENOF16: estimated cost of 8 for {{.*}} call <3 x half> @llvm.fma.v3f16 
define amdgpu_kernel void @fma_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 { %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %vec, <3 x half> %vec, <3 x half> %vec) #1 diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -1,7 +1,7 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FASTF16 %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOWF16 %s ; ALL-LABEL: 'fmul_f32' ; ALL: estimated cost of 1 for {{.*}} fmul float @@ -22,9 +22,7 @@ } ; ALL-LABEL: 'fmul_v3f32' -; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. -; ALL: estimated cost of {{[34]}} for {{.*}} fmul <3 x float> +; ALL: estimated cost of 3 for {{.*}} fmul <3 x float> define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fmul <3 x float> %vec, %b @@ -33,9 +31,7 @@ } ; ALL-LABEL: 'fmul_v5f32' -; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. 
-; ALL: estimated cost of {{[58]}} for {{.*}} fmul <5 x float> +; ALL: estimated cost of 5 for {{.*}} fmul <5 x float> define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fmul <5 x float> %vec, %b @@ -45,7 +41,8 @@ ; ALL-LABEL: 'fmul_f64' ; FASTF64: estimated cost of 2 for {{.*}} fmul double -; SLOWF64: estimated cost of 3 for {{.*}} fmul double +; SLOWF64: estimated cost of 4 for {{.*}} fmul double +; SIZEALL: estimated cost of 2 for {{.*}} fmul double define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fmul double %vec, %b @@ -55,7 +52,8 @@ ; ALL-LABEL: 'fmul_v2f64' ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double> -; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double> +; SLOWF64: estimated cost of 8 for {{.*}} fmul <2 x double> +; SIZEALL: estimated cost of 4 for {{.*}} fmul <2 x double> define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fmul <2 x double> %vec, %b @@ -65,7 +63,8 @@ ; ALL-LABEL: 'fmul_v3f64' ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double> -; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double> +; SLOWF64: estimated cost of 12 for {{.*}} fmul <3 x double> +; SIZEALL: estimated cost of 6 for {{.*}} fmul <3 x double> define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fmul <3 x double> %vec, %b diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -1,9 +1,9 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s -; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s -; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s +; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,FASTF16,ALL %s +; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,SLOWF16,ALL %s -; ALL: 'fsub_f32' +; ALL-LABEL: 'fsub_f32' ; ALL: estimated cost of 1 for {{.*}} fsub float define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { %vec = load float, float addrspace(1)* %vaddr @@ -12,7 +12,7 @@ ret void } -; ALL: 'fsub_v2f32' +; ALL-LABEL: 'fsub_v2f32' ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float> define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> 
addrspace(1)* %vaddr @@ -21,10 +21,8 @@ ret void } -; ALL: 'fsub_v3f32' -; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. -; ALL: estimated cost of {{[34]}} for {{.*}} fsub <3 x float> +; ALL-LABEL: 'fsub_v3f32' +; ALL: estimated cost of 3 for {{.*}} fsub <3 x float> define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fsub <3 x float> %vec, %b @@ -32,10 +30,8 @@ ret void } -; ALL: 'fsub_v5f32' -; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. -; ALL: estimated cost of {{[58]}} for {{.*}} fsub <5 x float> +; ALL-LABEL: 'fsub_v5f32' +; ALL: estimated cost of 5 for {{.*}} fsub <5 x float> define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fsub <5 x float> %vec, %b @@ -43,9 +39,10 @@ ret void } -; ALL: 'fsub_f64' +; ALL-LABEL: 'fsub_f64' ; FASTF64: estimated cost of 2 for {{.*}} fsub double -; SLOWF64: estimated cost of 3 for {{.*}} fsub double +; SLOWF64: estimated cost of 4 for {{.*}} fsub double +; SIZEALL: estimated cost of 2 for {{.*}} fsub double define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { %vec = load double, double addrspace(1)* %vaddr %add = fsub double %vec, %b @@ -53,9 +50,10 @@ ret void } -; ALL: 'fsub_v2f64' +; ALL-LABEL: 'fsub_v2f64' ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double> -; SLOWF64: estimated cost of 6 for {{.*}} fsub <2 x double> +; SLOWF64: estimated cost of 8 for {{.*}} fsub <2 x double> +; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double> define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr %add = fsub <2 x double> %vec, %b @@ -63,9 +61,10 @@ ret void } -; ALL: 'fsub_v3f64' +; ALL-LABEL: 'fsub_v3f64' ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double> -; SLOWF64: estimated cost of 9 for {{.*}} fsub <3 x double> +; SLOWF64: estimated cost of 12 for {{.*}} fsub <3 x double> +; SIZEALL: estimated cost of 6 for {{.*}} fsub <3 x double> define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr %add = fsub <3 x double> %vec, %b @@ -73,7 +72,7 @@ ret void } -; ALL: 'fsub_f16' +; ALL-LABEL: 'fsub_f16' ; ALL: estimated cost of 1 for {{.*}} fsub half define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { %vec = load half, half addrspace(1)* %vaddr @@ -82,7 +81,7 @@ ret void } -; ALL: 'fsub_v2f16' +; ALL-LABEL: 'fsub_v2f16' ; SLOWF16: estimated cost of 2 for {{.*}} fsub <2 x half> ; FASTF16: estimated cost of 1 for {{.*}} fsub <2 x half> define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { @@ -92,7 +91,7 @@ ret void } -; ALL: 'fsub_v3f16' +; ALL-LABEL: 'fsub_v3f16' ; SLOWF16: estimated cost of 4 for {{.*}} fsub <3 x half> ; FASTF16: estimated cost of 2 for {{.*}} fsub <3 x half> define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 { @@ -102,7 +101,7 @@ ret void 
} -; ALL: 'fsub_v4f16' +; ALL-LABEL: 'fsub_v4f16' ; SLOWF16: estimated cost of 4 for {{.*}} fsub <4 x half> ; FASTF16: estimated cost of 2 for {{.*}} fsub <4 x half> define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll @@ -1,11 +1,11 @@ -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,ALL %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,ALL %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED32,FUSED16,NOCONTRACT,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED32,FUSED16,CONTRACT,ALL %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,THRPTALL,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,THRPTALL,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,THRPTALL,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,THRPTALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,SZNOCONTRACT,SIZEALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 
-denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,SZNOCONTRACT,SIZEALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,SIZEALL,ALL %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,SZNOCONTRACT,SIZEALL,ALL %s target triple = "amdgcn--" @@ -113,8 +113,10 @@ ; ALL-LABEL: 'fmul_fadd_f64': ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double -; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double -; ALL: estimated cost of 3 for instruction: %add = fadd double +; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double +; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double +; THRPTALL: estimated cost of 4 for instruction: %add = fadd double +; SIZEALL: estimated cost of 2 for instruction: %add = fadd double define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 { %mul = fmul double %r0, %r1 %add = fadd double %mul, %r2 @@ -123,7 +125,8 @@ ; ALL-LABEL: 'fmul_fadd_contract_f64': ; ALL: estimated cost of 0 for instruction: %mul = fmul contract double -; ALL: estimated cost of 3 for instruction: %add = fadd contract double +; THRPTALL: estimated cost of 4 for instruction: %add = fadd contract double +; SIZEALL: estimated cost of 2 for instruction: %add = fadd contract double define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 { %mul = fmul contract double %r0, %r1 %add = fadd contract double %mul, %r2 @@ -132,8 +135,10 @@ ; ALL-LABEL: 'fmul_fadd_v2f64': ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double> -; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double> -; ALL: estimated cost of 6 for instruction: %add = fadd <2 x double> +; NOCONTRACT: estimated cost of 8 for instruction: %mul = fmul <2 x double> +; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double> +; THRPTALL: estimated cost of 8 for instruction: %add = fadd <2 x double> +; SIZEALL: estimated cost of 4 for instruction: %add = fadd <2 x double> define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 { %mul = fmul <2 x double> %r0, %r1 %add = fadd <2 x double> %mul, %r2 @@ -142,8 +147,10 @@ ; ALL-LABEL: 'fmul_fsub_f64': ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double -; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double -; ALL: estimated cost of 3 for instruction: %sub = fsub double +; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double +; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double +; THRPTALL: estimated cost of 4 for instruction: %sub = fsub double +; SIZEALL: estimated cost of 2 for instruction: %sub = fsub double define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 { %mul = fmul double %r0, %r1 %sub = fsub double %mul, %r2 @@ -152,8 +159,10 @@ ; ALL-LABEL: 'fmul_fsub_v2f64': ; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double> -; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double> -; ALL: estimated cost of 6 for instruction: %sub = fsub <2 x double> +; NOCONTRACT: estimated cost of 8 for instruction: %mul = 
+; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double>
+; THRPTALL: estimated cost of 8 for instruction: %sub = fsub <2 x double>
+; SIZEALL: estimated cost of 4 for instruction: %sub = fsub <2 x double>
 define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
   %mul = fmul <2 x double> %r0, %r1
   %sub = fsub <2 x double> %mul, %r2
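The contraction tests above all follow one pattern: when the fmul can be fused away it is free (cost 0); otherwise an f64 operation on this quarter-rate configuration costs 4 at throughput and 2 at code-size, and a <2 x double> operation is priced as exactly two scalar operations. Below is a minimal standalone C++ sketch that cross-checks those numbers; the constants and the helper simply restate the CHECK values and the per-element scaling visible above, and are not queried from LLVM:

```cpp
// Cross-check of the fused-cost expectations above. The constants restate
// the CHECK values (4 at throughput, 2 at code-size for one f64 op on a
// quarter-rate part); linear per-element scaling is the rule being checked.
constexpr int F64ThroughputCost = 4; // THRPTALL expectation
constexpr int F64CodeSizeCost = 2;   // SIZEALL expectation

// Vector ops are priced as NElts independent scalar ops.
constexpr int vectorCost(int ScalarCost, int NElts) {
  return ScalarCost * NElts;
}

// 'fmul_fadd_f64' / 'fmul_fsub_f64': scalar f64 op.
static_assert(vectorCost(F64ThroughputCost, 1) == 4, "THRPTALL f64");
static_assert(vectorCost(F64CodeSizeCost, 1) == 2, "SIZEALL f64");
// 'fmul_fadd_v2f64' / 'fmul_fsub_v2f64': exactly twice the scalar cost.
static_assert(vectorCost(F64ThroughputCost, 2) == 8, "THRPTALL v2f64");
static_assert(vectorCost(F64CodeSizeCost, 2) == 4, "SIZEALL v2f64");

int main() { return 0; }
```

The same two constants cover the fsub variants, since fsub is priced identically to fadd in these checks.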
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -1,10 +1,11 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,THRPTALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,THRPTALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZESLOW16,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=SIZEFAST16,SIZEALL,ALL %s
 
-; ALL: 'mul_i32'
-; ALL: estimated cost of 3 for {{.*}} mul i32
+; ALL-LABEL: 'mul_i32'
+; THRPTALL: estimated cost of 4 for {{.*}} mul i32
+; SIZEALL: estimated cost of 2 for {{.*}} mul i32
 define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %mul = mul i32 %vec, %b
@@ -12,8 +13,9 @@
   ret void
 }
 
-; ALL: 'mul_v2i32'
-; ALL: estimated cost of 6 for {{.*}} mul <2 x i32>
+; ALL-LABEL: 'mul_v2i32'
+; THRPTALL: estimated cost of 8 for {{.*}} mul <2 x i32>
+; SIZEALL: estimated cost of 4 for {{.*}} mul <2 x i32>
 define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %mul = mul <2 x i32> %vec, %b
@@ -21,10 +23,9 @@
   ret void
 }
 
-; ALL: 'mul_v3i32'
-; Allow for 12 when v3i32 is illegal and TargetLowering thinks it needs widening,
-; and 9 when it is legal.
-; ALL: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
+; ALL-LABEL: 'mul_v3i32'
+; THRPTALL: estimated cost of 12 for {{.*}} mul <3 x i32>
+; SIZEALL: estimated cost of 6 for {{.*}} mul <3 x i32>
 define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %mul = mul <3 x i32> %vec, %b
@@ -32,10 +33,9 @@
   ret void
 }
 
-; ALL: 'mul_v5i32'
-; Allow for 24 when v5i32 is illegal and TargetLowering thinks it needs widening,
-; and 15 when it is legal.
-; ALL: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
+; ALL-LABEL: 'mul_v5i32'
+; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32>
+; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32>
 define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
   %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
   %mul = mul <5 x i32> %vec, %b
@@ -43,8 +43,9 @@
   ret void
 }
 
-; ALL: 'mul_v4i32'
-; ALL: estimated cost of 12 for {{.*}} mul <4 x i32>
+; ALL-LABEL: 'mul_v4i32'
+; THRPTALL: estimated cost of 16 for {{.*}} mul <4 x i32>
+; SIZEALL: estimated cost of 8 for {{.*}} mul <4 x i32>
 define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %mul = mul <4 x i32> %vec, %b
@@ -52,8 +53,9 @@
   ret void
 }
 
-; ALL: 'mul_i64'
-; ALL: estimated cost of 16 for {{.*}} mul i64
+; ALL-LABEL: 'mul_i64'
+; THRPTALL: estimated cost of 20 for {{.*}} mul i64
+; SIZEALL: estimated cost of 12 for {{.*}} mul i64
 define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %mul = mul i64 %vec, %b
@@ -61,8 +63,9 @@
   ret void
 }
 
-; ALL: 'mul_v2i64'
-; ALL: estimated cost of 32 for {{.*}} mul <2 x i64>
+; ALL-LABEL: 'mul_v2i64'
+; THRPTALL: estimated cost of 40 for {{.*}} mul <2 x i64>
+; SIZEALL: estimated cost of 24 for {{.*}} mul <2 x i64>
 define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %mul = mul <2 x i64> %vec, %b
@@ -70,8 +73,9 @@
   ret void
 }
 
-; ALL: 'mul_v3i64'
-; ALL: estimated cost of 48 for {{.*}} mul <3 x i64>
+; ALL-LABEL: 'mul_v3i64'
+; THRPTALL: estimated cost of 60 for {{.*}} mul <3 x i64>
+; SIZEALL: estimated cost of 36 for {{.*}} mul <3 x i64>
 define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %mul = mul <3 x i64> %vec, %b
@@ -79,8 +83,9 @@
   ret void
 }
 
-; ALL: 'mul_v4i64'
-; ALL: estimated cost of 64 for {{.*}} mul <4 x i64>
+; ALL-LABEL: 'mul_v4i64'
+; THRPTALL: estimated cost of 80 for {{.*}} mul <4 x i64>
+; SIZEALL: estimated cost of 48 for {{.*}} mul <4 x i64>
 define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %mul = mul <4 x i64> %vec, %b
@@ -89,8 +94,9 @@
 }
 
-; ALL: 'mul_v8i64'
-; ALL: estimated cost of 256 for {{.*}} mul <8 x i64>
+; ALL-LABEL: 'mul_v8i64'
+; THRPTALL: estimated cost of 320 for {{.*}} mul <8 x i64>
+; SIZEALL: estimated cost of 192 for {{.*}} mul <8 x i64>
 define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %mul = mul <8 x i64> %vec, %b
@@ -98,8 +104,9 @@
   ret void
 }
 
-; ALL: 'mul_i16'
-; ALL: estimated cost of 3 for {{.*}} mul i16
+; ALL-LABEL: 'mul_i16'
+; THRPTALL: estimated cost of 4 for {{.*}} mul i16
+; SIZEALL: estimated cost of 2 for {{.*}} mul i16
 define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
   %mul = mul i16 %vec, %b
@@ -107,9 +114,11 @@
   ret void
 }
 
-; ALL: 'mul_v2i16'
-; SLOW16: estimated cost of 6 for {{.*}} mul <2 x i16>
-; FAST16: estimated cost of 3 for {{.*}} mul <2 x i16>
+; ALL-LABEL: 'mul_v2i16'
+; SLOW16: estimated cost of 8 for {{.*}} mul <2 x i16>
+; FAST16: estimated cost of 4 for {{.*}} mul <2 x i16>
+; SIZESLOW16: estimated cost of 4 for {{.*}} mul <2 x i16>
+; SIZEFAST16: estimated cost of 2 for {{.*}} mul <2 x i16>
 define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %mul = mul <2 x i16> %vec, %b
@@ -117,9 +126,11 @@
   ret void
 }
 
-; ALL: 'mul_v3i16'
-; SLOW16: estimated cost of 12 for {{.*}} mul <3 x i16>
-; FAST16: estimated cost of 6 for {{.*}} mul <3 x i16>
+; ALL-LABEL: 'mul_v3i16'
+; SLOW16: estimated cost of 16 for {{.*}} mul <3 x i16>
+; FAST16: estimated cost of 8 for {{.*}} mul <3 x i16>
+; SIZESLOW16: estimated cost of 8 for {{.*}} mul <3 x i16>
+; SIZEFAST16: estimated cost of 4 for {{.*}} mul <3 x i16>
 define amdgpu_kernel void @mul_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %vaddr, <3 x i16> %b) #0 {
   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %vaddr
   %mul = mul <3 x i16> %vec, %b
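The 64-bit multiply expectations in mul.ll scale per element up to <4 x i64>, while <8 x i64> comes out at 16x the scalar cost rather than 8x, presumably because the type is split again during legalization. The following hypothetical sketch reproduces the table from two constants; the 2x split factor for <8 x i64> is inferred from the 320/192 checks versus the linearly scaled 160/96, not taken from the C++ implementation:

```cpp
#include <cstdio>
#include <initializer_list>

// Per-element i64 multiply costs, restating the new 'mul_i64' CHECK values
// above; these are illustrative constants, not values queried from LLVM.
constexpr int I64MulThroughput = 20; // THRPTALL 'mul_i64'
constexpr int I64MulCodeSize = 12;   // SIZEALL 'mul_i64'

// Assumption: <8 x i64> pays one extra 2x split during type legalization.
constexpr int splitFactor(int NElts) { return NElts > 4 ? 2 : 1; }

constexpr int mulCost(int PerEltCost, int NElts) {
  return PerEltCost * NElts * splitFactor(NElts);
}

int main() {
  // Prints 20/12, 40/24, 60/36, 80/48 and 320/192, matching the checks.
  for (int N : {1, 2, 3, 4, 8})
    std::printf("mul <%d x i64>: throughput %d, code-size %d\n", N,
                mulCost(I64MulThroughput, N), mulCost(I64MulCodeSize, N));
  return 0;
}
```

The N = 1 row corresponds to the scalar 'mul_i64' case; the same per-element rule also explains the i32 rows above (4/2 per element through <5 x i32>).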
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
@@ -1,9 +1,9 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FAST16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOW16 %s
 
-; ALL: 'shl_i32'
+; ALL-LABEL: 'shl_i32'
 ; ALL: estimated cost of 1 for {{.*}} shl i32
 define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -12,9 +12,10 @@
   ret void
 }
 
-; ALL: 'shl_i64'
+; ALL-LABEL: 'shl_i64'
 ; FAST64: estimated cost of 2 for {{.*}} shl i64
-; SLOW64: estimated cost of 3 for {{.*}} shl i64
+; SLOW64: estimated cost of 4 for {{.*}} shl i64
+; SIZEALL: estimated cost of 2 for {{.*}} shl i64
 define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = shl i64 %vec, %b
@@ -22,7 +23,7 @@
   ret void
 }
 
-; ALL: 'shl_i16'
+; ALL-LABEL: 'shl_i16'
 ; ALL: estimated cost of 1 for {{.*}} shl i16
 define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -31,7 +32,7 @@
   ret void
 }
 
-; ALL: 'shl_v2i16'
+; ALL-LABEL: 'shl_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} shl <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} shl <2 x i16>
 define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
@@ -41,7 +42,7 @@
   ret void
 }
 
-; ALL: 'lshr_i32'
+; ALL-LABEL: 'lshr_i32'
 ; ALL: estimated cost of 1 for {{.*}} lshr i32
 define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -50,9 +51,10 @@
   ret void
 }
 
-; ALL: 'lshr_i64'
+; ALL-LABEL: 'lshr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} lshr i64
-; SLOW64: estimated cost of 3 for {{.*}} lshr i64
+; SLOW64: estimated cost of 4 for {{.*}} lshr i64
+; SIZEALL: estimated cost of 2 for {{.*}} lshr i64
 define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = lshr i64 %vec, %b
@@ -60,7 +62,7 @@
   ret void
 }
 
-; ALL: 'lshr_i16'
+; ALL-LABEL: 'lshr_i16'
 ; ALL: estimated cost of 1 for {{.*}} lshr i16
 define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -69,7 +71,7 @@
   ret void
 }
 
-; ALL: 'lshr_v2i16'
+; ALL-LABEL: 'lshr_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} lshr <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} lshr <2 x i16>
 define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
@@ -79,7 +81,7 @@
   ret void
 }
 
-; ALL: 'ashr_i32'
+; ALL-LABEL: 'ashr_i32'
 ; ALL: estimated cost of 1 for {{.*}} ashr i32
 define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -88,9 +90,9 @@
   ret void
 }
 
-; ALL: 'ashr_i64'
+; ALL-LABEL: 'ashr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
-; SLOW64: estimated cost of 3 for {{.*}} ashr i64
+; SLOW64: estimated cost of 4 for {{.*}} ashr i64
 define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = ashr i64 %vec, %b
@@ -98,7 +100,7 @@
   ret void
 }
 
-; ALL: 'ashr_i16'
+; ALL-LABEL: 'ashr_i16'
 ; ALL: estimated cost of 1 for {{.*}} ashr i16
 define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -107,7 +109,7 @@
   ret void
 }
 
-; ALL: 'ashr_v2i16'
+; ALL-LABEL: 'ashr_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} ashr <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} ashr <2 x i16>
 define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
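The shifts.ll changes reduce to a single rule for 64-bit shifts: half-rate-64 targets (FAST64) stay at 2, other targets (SLOW64) rise from 3 to 4 at throughput, and the new code-size runs check 2 for shl/lshr via SIZEALL (the ashr_i64 hunk only updates the throughput value). A hypothetical summary in the same style as the sketches above; the enum and helper are illustrative names, not LLVM's:

```cpp
// Summary of the 64-bit shift expectations above: throughput depends on the
// half-rate-64-ops feature, while code-size is 2 either way per the SIZEALL
// checks. Constants restate the CHECK values; nothing is queried from LLVM.
enum class CostKind { RecipThroughput, CodeSize };

constexpr int shiftI64Cost(bool HalfRate64Ops, CostKind Kind) {
  return Kind == CostKind::CodeSize ? 2 : (HalfRate64Ops ? 2 : 4);
}

static_assert(shiftI64Cost(true, CostKind::RecipThroughput) == 2, "FAST64");
static_assert(shiftI64Cost(false, CostKind::RecipThroughput) == 4, "SLOW64");
static_assert(shiftI64Cost(true, CostKind::CodeSize) == 2, "SIZEALL");
static_assert(shiftI64Cost(false, CostKind::CodeSize) == 2, "SIZEALL");

int main() { return 0; }
```

The 32-bit and 16-bit shifts are unaffected: they remain full-rate at cost 1, and packed <2 x i16> shifts stay at 1 on FAST16 targets versus 2 on SLOW16 targets.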