diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -716,18 +716,72 @@ unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; if (isShader(F.getCallingConv())) { + bool IsPixelShader = + F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); + + // Calculate the number of VGPR registers based on the SPI input registers + uint32_t InputEna = 0; + uint32_t InputAddr = 0; + unsigned LastEna = 0; + + if (IsPixelShader) { + // Note for IsPixelShader: + // By this stage, all enabled inputs are tagged in InputAddr as well. + // We will use InputAddr to determine whether the input counts against the + // vgpr total and only use the InputEnable to determine the last input + // that is relevant - if extra arguments are used, then we have to honour + // the InputAddr for any intermediate non-enabled inputs. + InputEna = MFI->getPSInputEnable(); + InputAddr = MFI->getPSInputAddr(); + + // We only need to consider input args up to the last used arg. + assert((InputEna || InputAddr) && + "PSInputAddr and PSInputEnable should " + "never both be 0 for AMDGPU_PS shaders"); + // There are some rare circumstances where InputAddr is non-zero and + // InputEna can be set to 0. In this case we default to setting LastEna + // to 1. + LastEna = InputEna ? findLastSet(InputEna) + 1 : 1; + } + // FIXME: We should be using the number of registers determined during // calling convention lowering to legalize the types. const DataLayout &DL = F.getParent()->getDataLayout(); + unsigned PSArgCount = 0; + unsigned IntermediateVGPR = 0; for (auto &Arg : F.args()) { unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32; - if (Arg.hasAttribute(Attribute::InReg)) + if (Arg.hasAttribute(Attribute::InReg)) { WaveDispatchNumSGPR += NumRegs; - else - WaveDispatchNumVGPR += NumRegs; + } else { + // If this is a PS shader and we're processing the PS Input args (first + // 16 VGPR), use the InputEna and InputAddr bits to define how many + // VGPRs are actually used. + // Any extra VGPR arguments are handled as normal arguments (and + // contribute to the VGPR count whether they're used or not). + if (IsPixelShader && PSArgCount < 16) { + if ((1 << PSArgCount) & InputAddr) { + if (PSArgCount < LastEna) + WaveDispatchNumVGPR += NumRegs; + else + IntermediateVGPR += NumRegs; + } + PSArgCount++; + } else { + // If there are extra arguments we have to include the allocation for + // the non-used (but enabled with InputAddr) input arguments + if (IntermediateVGPR) { + WaveDispatchNumVGPR += IntermediateVGPR; + IntermediateVGPR = 0; + } + WaveDispatchNumVGPR += NumRegs; + } + } } ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR); - ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR); + ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR); + ProgInfo.NumVGPR = + Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR); } // Adjust number of registers used to meet default/requested minimum/maximum diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h @@ -43,6 +43,10 @@ bool HasIndirectCall = false; int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const; + // Total number of VGPRs is actually a combination of AGPR and VGPR + // depending on architecture - and some alignment constraints + int32_t getTotalNumVGPRs(const GCNSubtarget &ST, int32_t NumAGPR, + int32_t NumVGPR) const; int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -82,11 +82,16 @@ ST.getTargetID().isXnackOnOrAny()); } +int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( + const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const { + if (ST.hasGFX90AInsts() && ArgNumAGPR) + return alignTo(ArgNumVGPR, 4) + ArgNumAGPR; + return std::max(ArgNumVGPR, ArgNumAGPR); +} + int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( const GCNSubtarget &ST) const { - if (ST.hasGFX90AInsts() && NumAGPR) - return alignTo(NumVGPR, 4) + NumAGPR; - return std::max(NumVGPR, NumAGPR); + return getTotalNumVGPRs(ST, NumAGPR, NumVGPR); } bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) { diff --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll @@ -0,0 +1,363 @@ +;RUN: llc < %s -mtriple=amdgcn-pal -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg: +; ;CHECK: NumVgprs: 4 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 { +.entry: + %i1 = extractelement <2 x float> %arg3, i32 1 + %ret1 = insertelement <4 x float> undef, float %i1, i32 0 + %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1, 0 + ret { <4 x float> } %ret2 +} + +; CHECK-LABEL: {{^}}_amdgpu_ps_3_arg: +; CHECK: NumVgprs: 6 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_3_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 { +.entry: + %i1 = extractelement <2 x float> %arg3, i32 1 + %i2 = extractelement <2 x float> %arg4, i32 0 + %i3 = extractelement <2 x float> %arg5, i32 1 + %ret1 = insertelement <4 x float> undef, float %i1, i32 0 + %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1 + %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2 + %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0 + ret { <4 x float> } %ret2 +} + +; CHECK-LABEL: {{^}}_amdgpu_ps_2_arg_gap: +; CHECK: NumVgprs: 4 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_2_arg_gap(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 { +.entry: + %i1 = extractelement <2 x float> %arg3, i32 1 + %i3 = extractelement <2 x float> %arg5, i32 1 + %ret1 = insertelement <4 x float> undef, float %i1, i32 0 + %ret1.2 = insertelement <4 x float> %ret1, float %i3, i32 1 + %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0 + ret { <4 x float> } %ret2 +} + +; Using InitialPSInputAddr of 0x2 causes the 2nd VGPR arg to be included in the packing - this increases the total number of VGPRs and in turn makes arg3 not be packed to be +; adjacent to arg1 (the only 2 used arguments) +; CHECK-LABEL: {{^}}_amdgpu_ps_2_arg_no_pack: +; CHECK: NumVgprs: 6 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_2_arg_no_pack(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #1 { +.entry: + %i1 = extractelement <2 x float> %arg3, i32 1 + %i3 = extractelement <2 x float> %arg5, i32 1 + %ret1 = insertelement <4 x float> undef, float %i1, i32 0 + %ret1.2 = insertelement <4 x float> %ret1, float %i3, i32 1 + %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0 + ret { <4 x float> } %ret2 +} + +; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg: +; CHECK: NumVgprs: 24 +define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 { +.entry: + %i1 = extractelement <2 x float> %arg3, i32 1 + %i2 = extractelement <2 x float> %arg4, i32 0 + %i3 = extractelement <2 x float> %arg5, i32 1 + %i4 = extractelement <3 x float> %arg6, i32 1 + %i5 = extractelement <2 x float> %arg7, i32 0 + %i6 = extractelement <2 x float> %arg8, i32 0 + %i7 = extractelement <2 x float> %arg9, i32 1 + + %ret1 = insertelement <4 x float> undef, float %i1, i32 0 + %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1 + %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2 + %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3 + + %ret2 = insertelement <4 x float> undef, float %i5, i32 0 + %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1 + %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2 + %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3 + + %ret3 = insertelement <4 x float> undef, float %arg11, i32 0 + %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1 + %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2 + %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3 + + %arg15.f = bitcast i32 %arg15 to float + %arg16.f = bitcast i32 %arg16 to float + %arg17.f = bitcast i32 %arg17 to float + %arg18.f = bitcast i32 %arg18 to float + + %ret4 = insertelement <4 x float> undef, float %arg15.f, i32 0 + %ret4.1 = insertelement <4 x float> %ret4, float %arg16.f, i32 1 + %ret4.2 = insertelement <4 x float> %ret4.1, float %arg17.f, i32 2 + %ret4.3 = insertelement <4 x float> %ret4.2, float %arg18.f, i32 3 + + %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0 + %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1 + %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2 + %ret.res = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3 + + ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res +} + +; Extra arguments have to be allocated even if they're unused +; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg_extra_unused: +; CHECK: NumVgprs: 26 +define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 { +.entry: + %i1 = extractelement <2 x float> %arg3, i32 1 + %i2 = extractelement <2 x float> %arg4, i32 0 + %i3 = extractelement <2 x float> %arg5, i32 1 + %i4 = extractelement <3 x float> %arg6, i32 1 + %i5 = extractelement <2 x float> %arg7, i32 0 + %i6 = extractelement <2 x float> %arg8, i32 0 + %i7 = extractelement <2 x float> %arg9, i32 1 + + %ret1 = insertelement <4 x float> undef, float %i1, i32 0 + %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1 + %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2 + %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3 + + %ret2 = insertelement <4 x float> undef, float %i5, i32 0 + %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1 + %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2 + %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3 + + %ret3 = insertelement <4 x float> undef, float %arg11, i32 0 + %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1 + %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2 + %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3 + + %arg15.f = bitcast i32 %arg15 to float + %arg16.f = bitcast i32 %arg16 to float + %arg17.f = bitcast i32 %arg17 to float + %arg18.f = bitcast i32 %arg18 to float + + %ret4 = insertelement <4 x float> undef, float %arg15.f, i32 0 + %ret4.1 = insertelement <4 x float> %ret4, float %arg16.f, i32 1 + %ret4.2 = insertelement <4 x float> %ret4.1, float %arg17.f, i32 2 + %ret4.3 = insertelement <4 x float> %ret4.2, float %arg18.f, i32 3 + + %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0 + %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1 + %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2 + %ret.res = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3 + + ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res +} + +; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg_extra: +; CHECK: NumVgprs: 26 +; CHECK: NumVGPRsForWavesPerEU: 26 +define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 { +.entry: + %i1 = extractelement <2 x float> %arg3, i32 1 + %i2 = extractelement <2 x float> %arg4, i32 0 + %i3 = extractelement <2 x float> %arg5, i32 1 + %i4 = extractelement <3 x float> %arg6, i32 1 + %i5 = extractelement <2 x float> %arg7, i32 0 + %i6 = extractelement <2 x float> %arg8, i32 0 + %i7 = extractelement <2 x float> %arg9, i32 1 + + %ret1 = insertelement <4 x float> undef, float %i1, i32 0 + %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1 + %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2 + %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3 + + %ret2 = insertelement <4 x float> undef, float %i5, i32 0 + %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1 + %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2 + %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3 + + %ret3 = insertelement <4 x float> undef, float %arg11, i32 0 + %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1 + %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2 + %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3 + + %arg15.f = bitcast i32 %arg15 to float + %arg16.f = bitcast i32 %arg16 to float + %arg17.f = bitcast i32 %arg17 to float + %arg18.f = bitcast i32 %arg18 to float + + %arg15_16.f = fadd float %arg15.f, %arg16.f + %arg17_18.f = fadd float %arg17.f, %arg18.f + + %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0 + %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1 + %ret4.2 = insertelement <4 x float> %ret4.1, float %arg15_16.f, i32 2 + %ret4.3 = insertelement <4 x float> %ret4.2, float %arg17_18.f, i32 3 + + %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0 + %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1 + %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2 + %ret.res = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3 + + ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res +} + +; Check that when no input args are used we get the minimum allocation - note that we always enable the first input +; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused: +; CHECK: NumVgprs: 4 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 { +.entry: + ret { <4 x float> } undef +} + +; Check that when no input args are used we get the minimum allocation - note that we always enable the first input +; Additionally set the PSInputAddr to 0 via the metadata +; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0: +; CHECK: NumVgprs: 4 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 { +.entry: + ret { <4 x float> } undef +} + +; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_used: +; CHECK: NumVgprs: 4 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_used(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 { +.entry: + %ret4.1 = insertelement <4 x float> undef, float %extra_arg1, i32 0 + %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg2, i32 1 + + %ret.res = insertvalue { <4 x float> } undef, <4 x float> %ret4.2, 0 + + ret { <4 x float> } %ret.res +} + +; CHECK-LABEL: {{^}}_amdgpu_ps_part_unused_extra_used: +; CHECK: NumVgprs: 5 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_part_unused_extra_used(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 { +.entry: + %ret4.1 = insertelement <4 x float> undef, float %arg14, i32 0 + %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg1, i32 1 + %ret4.3 = insertelement <4 x float> %ret4.2, float %extra_arg2, i32 2 + + %ret.res = insertvalue { <4 x float> } undef, <4 x float> %ret4.3, 0 + + ret { <4 x float> } %ret.res +} + +; CHECK-LABEL: {{^}}_amdgpu_ps_part_unused_extra_unused: +; CHECK: NumVgprs: 7 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_part_unused_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 { +.entry: + %ret4.1 = insertelement <4 x float> undef, float %arg12, i32 0 + %ret4.2 = insertelement <4 x float> %ret4.1, float %arg13, i32 1 + %ret4.3 = insertelement <4 x float> %ret4.2, float %arg14, i32 2 + + %ret.res = insertvalue { <4 x float> } undef, <4 x float> %ret4.3, 0 + + ret { <4 x float> } %ret.res +} + +; Extra unused inputs are always added to the allocation +; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_unused: +; CHECK: NumVgprs: 4 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 { +.entry: + + ret { <4 x float> } undef +} + +; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_used_no_packing: +; CHECK: NumVgprs: 26 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_used_no_packing(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 { +.entry: + %ret4.1 = insertelement <4 x float> undef, float %extra_arg1, i32 0 + %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg2, i32 1 + + %ret.res = insertvalue { <4 x float> } undef, <4 x float> %ret4.2, 0 + + ret { <4 x float> } %ret.res +} + +; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_unused_no_packing: +; CHECK: NumVgprs: 26 +define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_unused_no_packing(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 { +.entry: + ret { <4 x float> } undef +} + +; CHECK-LABEL: {{^}}_amdgpu_ps_some_unused_arg_extra: +; CHECK: NumVgprs: 24 +; CHECK: NumVGPRsForWavesPerEU: 24 +define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_some_unused_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 { +.entry: + %i1 = extractelement <2 x float> %arg3, i32 1 + %i2 = extractelement <2 x float> %arg4, i32 0 + %i3 = extractelement <2 x float> %arg5, i32 1 + %i4 = extractelement <3 x float> %arg6, i32 1 + %i5 = extractelement <2 x float> %arg7, i32 0 + %i6 = extractelement <2 x float> %arg8, i32 0 + %i7 = extractelement <2 x float> %arg9, i32 1 + + %ret1 = insertelement <4 x float> undef, float %i1, i32 0 + %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1 + %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2 + %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3 + + %ret2 = insertelement <4 x float> undef, float %i5, i32 0 + %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1 + %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2 + %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3 + + %ret3 = insertelement <4 x float> undef, float %arg11, i32 0 + %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1 + %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2 + %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3 + + %arg15.f = bitcast i32 %arg15 to float + %arg16.f = bitcast i32 %arg16 to float + + %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0 + %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1 + %ret4.2 = insertelement <4 x float> %ret4.1, float %arg15.f, i32 2 + %ret4.3 = insertelement <4 x float> %ret4.2, float %arg16.f, i32 3 + + %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0 + %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1 + %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2 + %ret.res = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3 + + ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res +} + +;CHECK-LABEL: {{^}}_amdgpu_ps_some_unused_no_packing_arg_extra: +;CHECK: NumVgprs: 26 +;CHECK: NumVGPRsForWavesPerEU: 26 +define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_some_unused_no_packing_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 { +.entry: + %i1 = extractelement <2 x float> %arg3, i32 1 + %i2 = extractelement <2 x float> %arg4, i32 0 + %i3 = extractelement <2 x float> %arg5, i32 1 + %i4 = extractelement <3 x float> %arg6, i32 1 + %i5 = extractelement <2 x float> %arg7, i32 0 + %i6 = extractelement <2 x float> %arg8, i32 0 + %i7 = extractelement <2 x float> %arg9, i32 1 + + %ret1 = insertelement <4 x float> undef, float %i1, i32 0 + %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1 + %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2 + %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3 + + %ret2 = insertelement <4 x float> undef, float %i5, i32 0 + %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1 + %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2 + %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3 + + %ret3 = insertelement <4 x float> undef, float %arg11, i32 0 + %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1 + %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2 + %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3 + + %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0 + %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1 + + %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0 + %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1 + %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2 + %ret.res = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.1, 3 + + ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res +} + +attributes #0 = { nounwind "target-features"=",+wavefrontsize64,+cumode" } +attributes #1 = { nounwind "InitialPSInputAddr"="2" "target-features"=",+wavefrontsize64,+cumode" } +attributes #2 = { nounwind "InitialPSInputAddr"="0xffff" "target-features"=",+wavefrontsize64,+cumode" } +attributes #3 = { nounwind "InitialPSInputAddr"="0" "target-features"=",+wavefrontsize64,+cumode" }