Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -345,8 +345,8 @@
   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   uint64_t CodeSize = 0;
-  unsigned MaxSGPR = 0;
-  unsigned MaxVGPR = 0;
+  unsigned MaxSGPR = MFI->getNumUserSGPRs() - 1;
+  unsigned MaxVGPR = MFI->getNumUserVGPRs() - 1;
   bool VCCUsed = false;
   bool FlatUsed = false;
   const SIRegisterInfo *RI =
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -738,8 +738,8 @@
         ((Info->getPSInputAddr() & 0x7F) == 0 ||
          ((Info->getPSInputAddr() & 0xF) == 0 &&
           Info->isPSInputAllocated(11)))) {
-      CCInfo.AllocateReg(AMDGPU::VGPR0);
-      CCInfo.AllocateReg(AMDGPU::VGPR1);
+      CCInfo.AllocateReg(Info->addArgUserReg(*TRI, AMDGPU::VGPR0, 4));
+      CCInfo.AllocateReg(Info->addArgUserReg(*TRI, AMDGPU::VGPR1, 4));
       Info->markPSInputAllocated(0);
       Info->PSInputEna |= 1;
     }
@@ -830,50 +830,59 @@
       Info->ABIArgOffset = Offset + MemVT.getStoreSize();
       continue;
     }
+
     assert(VA.isRegLoc() && "Parameter must be in a register!");
+    // Currently only in register types are 32-bit or 64-bit. Only the first
+    // register in the pair is returned by getLocReg.
+    bool IsSGPRArg = Arg.Flags.isInReg() || Arg.Flags.isByVal();
 
     unsigned Reg = VA.getLocReg();
-
-    if (VT == MVT::i64) {
-      // For now assume it is a pointer
-      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
-                                     &AMDGPU::SReg_64RegClass);
-      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
-      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-      InVals.push_back(Copy);
-      continue;
+    unsigned RegSize = VT.getStoreSize();
+    const TargetRegisterClass *RC = IsSGPRArg ?
+      TRI->getSGPRSizeClass(RegSize) : TRI->getVGPRSizeClass(RegSize);
+    if (RegSize == 8) {
+      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
+      assert(Reg != AMDGPU::NoRegister);
     }
 
-    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+    if (!Arg.VT.isVector()) {
+      unsigned NewReg = Info->addArgUserReg(*TRI, Reg, RegSize);
 
-    Reg = MF.addLiveIn(Reg, RC);
-    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      CCInfo.AllocateReg(NewReg);
+      Reg = MF.addLiveIn(NewReg, RC);
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      InVals.push_back(Val);
+      continue;
+    }
 
-    if (Arg.VT.isVector()) {
+    // Build a vector from the registers
+    Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+    unsigned NumElements = ParamType->getVectorNumElements();
 
-      // Build a vector from the registers
-      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
-      unsigned NumElements = ParamType->getVectorNumElements();
+    SmallVector<SDValue, 4> Regs;
+    unsigned NewReg = Info->addArgUserReg(*TRI, Reg, RegSize);
 
-      SmallVector<SDValue, 4> Regs;
-      Regs.push_back(Val);
-      for (unsigned j = 1; j != NumElements; ++j) {
-        Reg = ArgLocs[ArgIdx++].getLocReg();
-        Reg = MF.addLiveIn(Reg, RC);
+    CCInfo.AllocateReg(NewReg);
+    Reg = MF.addLiveIn(NewReg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+    Regs.push_back(Val);
 
-        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-        Regs.push_back(Copy);
-      }
-
-      // Fill up the missing vector elements
-      NumElements = Arg.VT.getVectorNumElements() - NumElements;
-      Regs.append(NumElements, DAG.getUNDEF(VT));
+    for (unsigned j = 1; j != NumElements; ++j) {
+      Reg = ArgLocs[ArgIdx++].getLocReg();
+      unsigned NewReg = Info->addArgUserReg(*TRI, Reg, RegSize);
+      CCInfo.AllocateReg(NewReg);
+      Reg = MF.addLiveIn(NewReg, RC);
 
-      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
-      continue;
+      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      Regs.push_back(Copy);
     }
 
-    InVals.push_back(Val);
+    // Fill up the missing vector elements
+    NumElements = Arg.VT.getVectorNumElements() - NumElements;
+    Regs.append(NumElements, DAG.getUNDEF(VT));
+
+    InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
   }
 
   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -73,6 +73,7 @@
   unsigned ScratchOffsetReg;
   unsigned NumUserSGPRs;
   unsigned NumSystemSGPRs;
+  unsigned NumUserVGPRs;
 
 private:
   bool HasSpilledSGPRs;
@@ -112,6 +113,10 @@
     return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
   }
 
+  MCPhysReg getNextUserVGPR() const {
+    return AMDGPU::VGPR0 + NumUserVGPRs;
+  }
+
 public:
   struct SpilledReg {
     unsigned VGPR;
@@ -138,6 +143,9 @@
   unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
   unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
 
+  unsigned addArgUserReg(const SIRegisterInfo &TRI,
+                         unsigned CurReg, unsigned Size);
+
   // Add system SGPRs.
   unsigned addWorkGroupIDX() {
     WorkGroupIDXSystemSGPR = getNextSystemSGPR();
@@ -249,6 +257,10 @@
     return NumUserSGPRs + NumSystemSGPRs;
   }
 
+  unsigned getNumUserVGPRs() const {
+    return NumUserVGPRs;
+  }
+
   unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
     return PrivateSegmentWaveByteOffsetSystemSGPR;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -54,6 +54,7 @@
     PSInputEna(0),
     NumUserSGPRs(0),
     NumSystemSGPRs(0),
+    NumUserVGPRs(0),
     HasSpilledSGPRs(false),
     HasSpilledVGPRs(false),
     HasNonSpillStackObjects(false),
@@ -102,8 +103,18 @@
   // X, XY, and XYZ are the only supported combinations, so make sure Y is
   // enabled if Z is.
-  if (WorkItemIDZ)
+  if (WorkItemIDZ) {
     WorkItemIDY = true;
+  }
+
+  if (WorkItemIDX)
+    ++NumUserVGPRs;
+
+  if (WorkItemIDY)
+    ++NumUserVGPRs;
+
+  if (WorkItemIDZ)
+    ++NumUserVGPRs;
 
   bool MaySpill = ST.isVGPRSpillingEnabled(*F);
   bool HasStackObjects = FrameInfo->hasStackObjects();
@@ -174,6 +185,42 @@
   return FlatScratchInitUserSGPR;
 }
 
+unsigned SIMachineFunctionInfo::addArgUserReg(const SIRegisterInfo &TRI,
+                                              unsigned CurReg,
+                                              unsigned Size) {
+  const TargetRegisterClass *RC = TRI.getPhysRegClass(CurReg);
+
+  // VGPRs have no alignment restrictions
+  if (TRI.hasVGPRs(RC)) {
+    unsigned Reg = getNextUserVGPR();
+    NumUserVGPRs += Size / 4;
+    return Reg;
+  }
+
+  // SGPRs have alignment restrictions.
+  if (Size == 4) {
+    unsigned Reg = getNextUserSGPR();
+    NumUserSGPRs += 1;
+    return Reg;
+  }
+
+  assert(Size == 8 &&
+         "user sgpr calling convention only has 4 or 8 byte types");
+
+  unsigned FirstReg = TRI.getSubReg(CurReg, AMDGPU::sub0);
+
+  // Skip over padding register. We assume the register passed in is correctly
+  // aligned.
+  unsigned Reg = getNextUserSGPR();
+  while (Reg != FirstReg) {
+    ++Reg;
+    ++NumUserSGPRs;
+  }
+
+  NumUserSGPRs += 2;
+  return CurReg;
+}
+
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
   MachineFunction *MF,
   unsigned FrameIndex,
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -117,11 +117,18 @@
   /// \returns A VGPR reg class with the same width as \p SRC
   const TargetRegisterClass *getEquivalentVGPRClass(
-                                          const TargetRegisterClass *SRC) const;
+                                          const TargetRegisterClass *SRC) const {
+    return getVGPRSizeClass(SRC->getSize());
+  }
 
   /// \returns A SGPR reg class with the same width as \p SRC
   const TargetRegisterClass *getEquivalentSGPRClass(
-                                          const TargetRegisterClass *VRC) const;
+                                          const TargetRegisterClass *VRC) const {
+    return getSGPRSizeClass(VRC->getSize());
+  }
+
+  static const TargetRegisterClass *getVGPRSizeClass(unsigned Size);
+  static const TargetRegisterClass *getSGPRSizeClass(unsigned Size);
 
   /// \returns The register class that is used for a sub-register of \p RC for
   /// the given \p SubIdx.  If \p SubIdx equals NoSubRegister, \p RC will
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -699,9 +699,8 @@
   }
 }
 
-const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
-                                          const TargetRegisterClass *SRC) const {
-  switch (SRC->getSize()) {
+const TargetRegisterClass *SIRegisterInfo::getVGPRSizeClass(unsigned Size) {
+  switch (Size) {
   case 4:
     return &AMDGPU::VGPR_32RegClass;
   case 8:
@@ -719,9 +718,9 @@
   }
 }
 
-const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
-                                          const TargetRegisterClass *VRC) const {
-  switch (VRC->getSize()) {
+
+const TargetRegisterClass *SIRegisterInfo::getSGPRSizeClass(unsigned Size) {
+  switch (Size) {
   case 4:
     return &AMDGPU::SGPR_32RegClass;
   case 8:
Index: test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
+++ test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
@@ -1,10 +1,93 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}unused_ptr_0:
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s2
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s3
+; GCN: NumSgprs: 8
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 4
+define amdgpu_vs void @unused_ptr_0(i32 addrspace(2)* inreg %arg0, i32 addrspace(2)* inreg %arg1) #0 {
+  store volatile i32 addrspace(2)* %arg1, i32 addrspace(2)* addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}unused_ptr_1:
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s0
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s1
+; GCN: NumSgprs: 8
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 4
+define amdgpu_vs void @unused_ptr_1(i32 addrspace(2)* inreg %arg0, i32 addrspace(2)* inreg %arg1) #0 {
+  store volatile i32 addrspace(2)* %arg0, i32 addrspace(2)* addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}unused_i32_ptr_0:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s0
+; GCN: NumSgprs: 8
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 4
+define amdgpu_vs void @unused_i32_ptr_0(i32 inreg %arg0, i32 addrspace(2)* inreg %arg1) #0 {
+  store volatile i32 %arg0, i32 addrspace(1)* null
+  ret void
+}
+
+; XGCN-LABEL: {{^}}f64_input:
+; XGCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s0
+; XGCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s1
+; XGCN: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+;define amdgpu_vs void @f64_input(double inreg %arg0) #0 {
+;  store volatile double %arg0, double addrspace(1)* null
+;  ret void
+;}
 
-; GCN-LABEL: {{^}}shader_cc:
+; GCN-LABEL: {{^}}unused_ptr_i32_0:
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s0
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s1
+; GCN: NumSgprs: 8
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 3
+define amdgpu_vs void @unused_ptr_i32_0(i32 addrspace(2)* inreg %arg0, i32 inreg %arg1) #0 {
+  store volatile i32 addrspace(2)* %arg0, i32 addrspace(2)* addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}unused_i32_v4i32_0:
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s1
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s2
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s3
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s4
+; GCN: NumSgprs: 5
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 5
+define amdgpu_vs void @unused_i32_v4i32_0(i32 inreg %arg0, <4 x i32> inreg %arg1) #0 {
+  store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}shader_cc_0:
 ; GCN: v_add_i32_e32 v0, vcc, s8, v0
-define amdgpu_cs float @shader_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+; GCN: NumSgprs: 11
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 9
+define amdgpu_cs float @shader_cc_0(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) #0 {
+  %vi = bitcast float %v to i32
+  %x = add i32 %vi, %w
+  %xf = bitcast i32 %x to float
+  ret float %xf
+}
+
+; GCN-LABEL: {{^}}shader_cc_1:
+; GCN: v_add_i32_e32 v0, vcc, s6, v0
+; GCN: NumSgprs: 9
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 7
+define amdgpu_cs float @shader_cc_1(<3 x i32> inreg, <3 x i32> inreg, i32 inreg %w, float %v) #0 {
+  %vi = bitcast float %v to i32
+  %x = add i32 %vi, %w
+  %xf = bitcast i32 %x to float
+  ret float %xf
+}
+
+; GCN-LABEL: {{^}}shader_cc_2:
+; GCN: v_add_i32_e32 v0, vcc, s6, v0
+; GCN: NumSgprs: 9
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 7
+define amdgpu_cs float @shader_cc_2(<3 x i32> inreg, i64 inreg, i32 inreg %w, float %v) #0 {
   %vi = bitcast float %v to i32
   %x = add i32 %vi, %w
   %xf = bitcast i32 %x to float
@@ -13,9 +96,13 @@
 
 ; GCN-LABEL: {{^}}kernel_cc:
 ; GCN: s_endpgm
-define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+; GCN: NumSgprs: 2
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) #0 {
   %vi = bitcast float %v to i32
   %x = add i32 %vi, %w
   %xf = bitcast i32 %x to float
   ret float %xf
 }
+
+attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -88,11 +88,12 @@
 ;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
-define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) {
+define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <4 x float> %arg3, i32 %arg4) {
 main_body:
-  call void @llvm.amdgcn.image.store.i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0)
-  %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0)
-  call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %arg2 = load volatile <8 x i32>, <8 x i32> addrspace(2)* undef
+  call void @llvm.amdgcn.image.store.i32(<4 x float> %arg3, i32 %arg4, <8 x i32> %arg, i32 15, i1 false, i1 false, i1 false, i1 false)
+  %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %arg4, <8 x i32> %arg1, i32 15, i1 false, i1 false, i1 false, i1 false)
+  call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %arg4, <8 x i32> %arg2, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret void
 }
Index: test/CodeGen/AMDGPU/register-count-comments.ll
===================================================================
--- test/CodeGen/AMDGPU/register-count-comments.ll
+++ test/CodeGen/AMDGPU/register-count-comments.ll
@@ -26,3 +26,19 @@
   store i32 %x, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; SI-LABEL: {{^}}one_vgpr_used_3_enabled:
+; SI: NumVgprs: 3
+define void @one_vgpr_used_3_enabled(i32 addrspace(1)* %out) nounwind {
+  %x = call i32 @llvm.amdgcn.workitem.id.x()
+  %y = call i32 @llvm.amdgcn.workitem.id.y()
+  %z = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %x, i32 addrspace(1)* %out, align 4
+  store volatile i32 %y, i32 addrspace(1)* %out, align 4
+  store volatile i32 %z, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.z() nounwind readnone
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -11,18 +11,20 @@
 ; GCN-LABEL: {{^}}main:
+; GCN-DAG: s_mov_b32 s16, s12
 ; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-DAG: s_mov_b32 s14, -1
 ; SI-DAG: s_mov_b32 s15, 0x98f000
 ; VI-DAG: s_mov_b32 s15, 0x980000
-; s12 is offset user SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload
+; s16 is offset user SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 16-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 16-byte Folded Reload
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 12
 define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) {
 bb: