Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -457,18 +457,27 @@ S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | - S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + // 0 = X, 1 = XY, 2 = XYZ + unsigned TIDIGCompCnt = 0; + if (MFI->hasWorkItemIDZ()) + TIDIGCompCnt = 2; + else if (MFI->hasWorkItemIDY()) + TIDIGCompCnt = 1; + ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | - S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | - S_00B84C_TGID_X_EN(1) | - S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); + S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | + S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | + S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | + S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | + S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | + S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | + S_00B84C_EXCP_EN_MSB(0) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) | + S_00B84C_EXCP_EN(0); } static unsigned getRsrcReg(unsigned ShaderType) { @@ -529,10 +538,44 @@ header.compute_pgm_resource_registers = KernelInfo.ComputePGMRSrc1 | (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR | - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | - AMD_CODE_PROPERTY_IS_PTR64; + header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + if (MFI->hasPrivateSegmentBuffer()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (MFI->hasQueuePtr()) + header.code_properties |= 
AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + + if (MFI->hasKernargSegmentPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + + if (MFI->hasDispatchID()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + + if (MFI->hasFlatScratchInit()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + // TODO: Private segment size + + if (MFI->hasGridWorkgroupCountX()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; + } + + if (MFI->hasGridWorkgroupCountY()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; + } + + if (MFI->hasGridWorkgroupCountZ()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; + } header.kernarg_segment_byte_size = MFI->ABIArgOffset; header.wavefront_sgpr_count = KernelInfo.NumSGPR; Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1093,14 +1093,10 @@ SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); const SIMachineFunctionInfo *Info = MF.getInfo(); - unsigned ScratchOffsetReg = TRI->getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); - SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); // (add n0, c1) if (CurDAG->isBaseWithConstantOffset(Addr)) { Index: lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIFrameLowering.cpp +++ lib/Target/AMDGPU/SIFrameLowering.cpp @@ -36,6 +36,16 @@ return true; } +static ArrayRef getAllSGPR128() { + return 
makeArrayRef(AMDGPU::SReg_128RegClass.begin(), + AMDGPU::SReg_128RegClass.getNumRegs()); +} + +static ArrayRef getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { if (!MF.getFrameInfo()->hasStackObjects()) @@ -43,7 +53,7 @@ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); - const SIMachineFunctionInfo *MFI = MF.getInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo(); // If we only have SGPR spills, we won't actually be using scratch memory // since these spill to VGPRs. @@ -56,31 +66,159 @@ const SIInstrInfo *TII = static_cast(MF.getSubtarget().getInstrInfo()); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); // We need to insert initialization of the scratch resource descriptor. unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); assert(ScratchRsrcReg != AMDGPU::NoRegister); - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); + + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOS()) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + } + + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. + if (ScratchRsrcReg == PreloadedPrivateBufferReg) { + // We should always reserve these 5 registers at the same time. + assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && + "scratch wave offset and private segment buffer inconsistent"); + return; + } + + + // We added live-ins during argument lowering, but since they were not used + // they were deleted. 
We're adding the uses now, so add them back. + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + + if (ST.isAmdHsaOS()) { + MRI.addLiveIn(PreloadedPrivateBufferReg); + MBB.addLiveIn(PreloadedPrivateBufferReg); + } + + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. + + // FIXME: We should be able to eliminate unused input registers. We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } + } + } + + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 6 SGPRs (see drop_back(6) below). NOTE(review): this comment + // duplicated the SGPR128 loop's text and said 2; confirm what the 6 covers. + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven't added its uses yet. 
+ if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } + } + } + + + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); MachineBasicBlock::iterator I = MBB.begin(); DebugLoc DL; - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0"); + if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + // Make sure we emit the copy for the offset first. We may have chosen to copy + // the buffer resource into a register that aliases the input offset register. + BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + } - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1"); + if (ST.isAmdHsaOS()) { + // Insert copies from argument register. 
+ assert( + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); + + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); + + unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); + unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + + const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, SMovB64, Rsrc01) + .addReg(Lo, RegState::Kill); + BuildMI(MBB, I, DL, SMovB64, Rsrc23) + .addReg(Hi, RegState::Kill); + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + // Use relocations to get the pointer, and setup the other bits manually. + uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc2) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc3) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc23 & 0xffffffff); + // Make the register selected live throughout the function. 
+ for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc23 >> 32); + OtherBB.addLiveIn(ScratchRsrcReg); + OtherBB.addLiveIn(ScratchWaveOffsetReg); + } } void SIFrameLowering::processFunctionBeforeFrameFinalized( Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -504,6 +504,11 @@ Align); // Alignment } +static ArrayRef getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + SDValue SITargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, @@ -581,34 +586,28 @@ CCInfo.AllocateReg(AMDGPU::VGPR1); } - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs += 4; // FIXME: Need to support scratch buffers. - else - Info->NumUserSGPRs += 4; + getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, + Splits); + } - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
+ if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - if (Subtarget->isAmdHsaOS()) { - unsigned DispatchPtrReg - = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR); - MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); - } + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); } - if (Info->getShaderType() == ShaderType::COMPUTE) { - getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, - Splits); + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); } AnalyzeFormalArguments(CCInfo, Splits); @@ -700,14 +699,114 @@ InVals.push_back(Val); } - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. 
+ if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); } - if (MF.getFrameInfo()->hasStackObjects() || ST.isVGPRSpillingEnabled(Info)) - Info->setScratchRSrcReg(TRI); + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if + // we should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly. 
+ + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. 
+ Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } if (Chains.empty()) return Chain; @@ -1014,10 +1113,6 @@ EVT MemVT; SDValue Param; if (Subtarget->isAmdHsaOS()) { - const SIMachineFunctionInfo *MFI = MF.getInfo(); - (void) MFI; - assert(MFI->hasDispatchPtr()); - BasePtr = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR); // Local size value are 16-bits, but we always load 32-bit values and @@ -1070,8 +1165,12 @@ EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); - unsigned InputPtrReg = TRI->getPreloadedValue( - MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + unsigned InputPtrReg = AMDGPU::NoRegister; + + if (MFI->getShaderType() == ShaderType::COMPUTE) { + InputPtrReg + = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + } // TODO: Should this propagate fast-math-flags? 
Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -551,16 +551,13 @@ assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue( - *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg) // src .addFrameIndex(FrameIndex) // frame_idx - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(ScratchOffsetPreloadReg) // scratch_offset + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addMemOperand(MMO); } @@ -638,14 +635,11 @@ assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue( - *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // frame_idx - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(ScratchOffsetPreloadReg) // scratch_offset + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addMemOperand(MMO); } Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -26,10 +26,36 @@ /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { + // FIXME: This should be removed and getPreloadedValue moved here. 
+ friend struct SIRegisterInfo; void anchor() override; unsigned TIDReg; + + // Registers that may be reserved for spilling purposes. These may be the same + // as the input registers. unsigned ScratchRSrcReg; + unsigned ScratchWaveOffsetReg; + + // Input registers setup for the HSA ABI. + // User SGPRs in allocation order. + unsigned PrivateSegmentBufferUserSGPR; + unsigned DispatchPtrUserSGPR; + unsigned QueuePtrUserSGPR; + unsigned KernargSegmentPtrUserSGPR; + unsigned DispatchIDUserSGPR; + unsigned FlatScratchInitUserSGPR; + unsigned PrivateSegmentSizeUserSGPR; + unsigned GridWorkGroupCountXUserSGPR; + unsigned GridWorkGroupCountYUserSGPR; + unsigned GridWorkGroupCountZUserSGPR; + + // System SGPRs in allocation order. + unsigned WorkGroupIDXSystemSGPR; + unsigned WorkGroupIDYSystemSGPR; + unsigned WorkGroupIDZSystemSGPR; + unsigned WorkGroupInfoSystemSGPR; + unsigned PrivateSegmentWaveByteOffsetSystemSGPR; public: // FIXME: Make private @@ -38,12 +64,14 @@ std::map LaneVGPRs; unsigned ScratchOffsetReg; unsigned NumUserSGPRs; + unsigned NumSystemSGPRs; private: bool HasSpilledSGPRs; bool HasSpilledVGPRs; - // Feature bits required for inputs passed in user / system SGPRs. + // Feature bits required for inputs passed in user SGPRs. + bool PrivateSegmentBuffer : 1; bool DispatchPtr : 1; bool QueuePtr : 1; bool DispatchID : 1; @@ -53,15 +81,27 @@ bool GridWorkgroupCountY : 1; bool GridWorkgroupCountZ : 1; + // Feature bits required for inputs passed in system SGPRs. bool WorkGroupIDX : 1; // Always initialized. bool WorkGroupIDY : 1; bool WorkGroupIDZ : 1; bool WorkGroupInfo : 1; + bool PrivateSegmentWaveByteOffset : 1; bool WorkItemIDX : 1; // Always initialized. 
bool WorkItemIDY : 1; bool WorkItemIDZ : 1; + + MCPhysReg getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; + } + + MCPhysReg getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; + } + public: struct SpilledReg { unsigned VGPR; @@ -80,6 +120,46 @@ unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } + // Add user SGPRs. + unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + unsigned addDispatchPtr(const SIRegisterInfo &TRI); + unsigned addQueuePtr(const SIRegisterInfo &TRI); + unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + + // Add system SGPRs. + unsigned addWorkGroupIDX() { + WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDXSystemSGPR; + } + + unsigned addWorkGroupIDY() { + WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDYSystemSGPR; + } + + unsigned addWorkGroupIDZ() { + WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDZSystemSGPR; + } + + unsigned addWorkGroupInfo() { + WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupInfoSystemSGPR; + } + + unsigned addPrivateSegmentWaveByteOffset() { + PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + bool hasPrivateSegmentBuffer() const { + return PrivateSegmentBuffer; + } bool hasDispatchPtr() const { return DispatchPtr; @@ -129,6 +209,10 @@ return WorkGroupInfo; } + bool hasPrivateSegmentWaveByteOffset() const { + return PrivateSegmentWaveByteOffset; + } + bool hasWorkItemIDX() const { return WorkItemIDX; } @@ -141,13 +225,37 @@ return WorkItemIDZ; } + unsigned getNumUserSGPRs() const { + return NumUserSGPRs; + } + + unsigned getNumPreloadedSGPRs() const { + return NumUserSGPRs + 
NumSystemSGPRs; + } + + unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + /// \brief Returns the physical register reserved for use as the resource /// descriptor for scratch accesses. unsigned getScratchRSrcReg() const { return ScratchRSrcReg; } - void setScratchRSrcReg(const SIRegisterInfo *TRI); + void setScratchRSrcReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchRSrcReg = Reg; + } + + unsigned getScratchWaveOffsetReg() const { + return ScratchWaveOffsetReg; + } + + void setScratchWaveOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchWaveOffsetReg = Reg; + } bool hasSpilledSGPRs() const { return HasSpilledSGPRs; Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -30,15 +30,33 @@ : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), ScratchRSrcReg(AMDGPU::NoRegister), + ScratchWaveOffsetReg(AMDGPU::NoRegister), + PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), + DispatchPtrUserSGPR(AMDGPU::NoRegister), + QueuePtrUserSGPR(AMDGPU::NoRegister), + KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), + DispatchIDUserSGPR(AMDGPU::NoRegister), + FlatScratchInitUserSGPR(AMDGPU::NoRegister), + PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), + WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), + WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), + PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), LDSWaveSpillSize(0), PSInputAddr(0), NumUserSGPRs(0), + NumSystemSGPRs(0), HasSpilledSGPRs(false), 
HasSpilledVGPRs(false), + PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), DispatchID(false), - KernargSegmentPtr(true), + KernargSegmentPtr(false), FlatScratchInit(false), GridWorkgroupCountX(false), GridWorkgroupCountY(false), @@ -47,13 +65,17 @@ WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false), + PrivateSegmentWaveByteOffset(false), WorkItemIDX(true), WorkItemIDY(false), WorkItemIDZ(false) { + const AMDGPUSubtarget &ST = MF.getSubtarget(); const Function *F = MF.getFunction(); - if (F->hasFnAttribute("amdgpu-dispatch-ptr")) - DispatchPtr = true; + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + if (getShaderType() == ShaderType::COMPUTE) + KernargSegmentPtr = true; if (F->hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; @@ -66,14 +88,54 @@ if (F->hasFnAttribute("amdgpu-work-item-id-z")) WorkItemIDZ = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool HasStackObjects = FrameInfo->hasStackObjects(); + + if (HasStackObjects || MaySpill) + PrivateSegmentWaveByteOffset = true; + + if (ST.isAmdHsaOS()) { + if (HasStackObjects || MaySpill) + PrivateSegmentBuffer = true; + + if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + } + + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. 
+ if (WorkItemIDZ) + WorkItemIDY = true; +} + +unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( + const SIRegisterInfo &TRI) { + PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + NumUserSGPRs += 4; + return PrivateSegmentBufferUserSGPR; +} + +unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { + DispatchPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchPtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { + QueuePtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return QueuePtrUserSGPR; } -void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) { - // We need to round up to next multiple of 4. - unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4); - unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128); - ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0, - &AMDGPU::SReg_128RegClass); +unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { + KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return KernargSegmentPtrUserSGPR; } SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -29,6 +29,15 @@ public: SIRegisterInfo(); + /// Return the end register initially reserved for the scratch buffer in case + /// spilling is needed. 
+ unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + + /// Return the end register initially reserved for the scratch wave offset in + /// case spilling is needed. + unsigned reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; unsigned getRegPressureSetLimit(const MachineFunction &MF, Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -32,6 +32,40 @@ Reserved.set(*R); } +unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget(); + if (ST.hasSGPRInitBug()) { + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the + // next sgpr128 down. + return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; + } + + return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget(); + if (ST.hasSGPRInitBug()) { + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Next register before reservations for flat_scr and vcc. 
+ return AMDGPU::SGPR97; + } + + return AMDGPU::SGPR95; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -69,19 +103,20 @@ } const SIMachineFunctionInfo *MFI = MF.getInfo(); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { + // Reserve 1 SGPR for scratch wave offset in case we need to spill. + reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); + } + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg != AMDGPU::NoRegister) { - unsigned ScratchOffsetPreloadReg - = getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - // We will need to use this user SGPR argument for spilling, and thus never - // want it to be spilled. - reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg); - // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need // to spill. // TODO: May need to reserve a VGPR if doing LDS spilling. reserveRegisterTuples(Reserved, ScratchRSrcReg); - assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg)); + assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } return Reserved; @@ -204,11 +239,10 @@ unsigned SubReg = NumSubRegs > 1 ? getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; - bool IsKill = (i == e - 1); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) + .addReg(ScratchRsrcReg) .addReg(SOffset) .addImm(Offset) .addImm(0) // glc @@ -526,6 +560,9 @@ return OpType == AMDGPU::OPERAND_REG_INLINE_C; } +// FIXME: Most of these are flexible with HSA and we don't need to reserve them +// as input registers if unused. Whether the dispatch ptr is necessary should be +// easy to detect from used intrinsics. Scratch setup is harder to know. 
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { @@ -533,28 +570,36 @@ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); switch (Value) { case SIRegisterInfo::WORKGROUP_ID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); + assert(MFI->hasWorkGroupIDX()); + return MFI->WorkGroupIDXSystemSGPR; case SIRegisterInfo::WORKGROUP_ID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); + assert(MFI->hasWorkGroupIDY()); + return MFI->WorkGroupIDYSystemSGPR; case SIRegisterInfo::WORKGROUP_ID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); + assert(MFI->hasWorkGroupIDZ()); + return MFI->WorkGroupIDZSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); + return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - llvm_unreachable("currently unused"); + assert(STI.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; case SIRegisterInfo::KERNARG_SEGMENT_PTR: - return STI.isAmdHsaOS() ?
AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1; + assert(MFI->hasKernargSegmentPtr()); + return MFI->KernargSegmentPtrUserSGPR; case SIRegisterInfo::DISPATCH_PTR: - return AMDGPU::SGPR0_SGPR1; + assert(MFI->hasDispatchPtr()); + return MFI->DispatchPtrUserSGPR; case SIRegisterInfo::QUEUE_PTR: llvm_unreachable("not implemented"); case SIRegisterInfo::WORKITEM_ID_X: + assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; case SIRegisterInfo::WORKITEM_ID_Y: + assert(MFI->hasWorkItemIDY()); return AMDGPU::VGPR1; case SIRegisterInfo::WORKITEM_ID_Z: + assert(MFI->hasWorkItemIDZ()); return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); Index: test/CodeGen/AMDGPU/hsa.ll =================================================================== --- test/CodeGen/AMDGPU/hsa.ll +++ test/CodeGen/AMDGPU/hsa.ll @@ -38,8 +38,10 @@ ; HSA: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: .end_amd_kernel_code_t -; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[2:3], 0x0 +; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 Index: test/CodeGen/AMDGPU/large-alloca-compute.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-compute.ll +++ test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -1,31 +1,46 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s -; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s -; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s +; RUN: llc 
-march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s ; FIXME: align on alloca seems to be ignored for private_segment_alignment ; ALL-LABEL: {{^}}large_alloca_compute_shader: -; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN: s_mov_b32 s14, -1 -; CI: s_mov_b32 s15, 0x80f000 -; VI: s_mov_b32 s15, 0x800000 +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 ; GCNHSA: .amd_kernel_code_t + +; GCNHSA: compute_pgm_rsrc2_scratch_en = 1 +; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6 +; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1 +; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0 +; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0 +; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0 +; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 + +; GCNHSA: enable_sgpr_private_segment_buffer = 1 +; GCNHSA: enable_sgpr_dispatch_ptr = 0 +; GCNHSA: enable_sgpr_queue_ptr = 0 +; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1 +; GCNHSA: enable_sgpr_dispatch_id = 0 +; GCNHSA: enable_sgpr_flat_scratch_init = 0 +; GCNHSA: enable_sgpr_private_segment_size = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCNHSA: workitem_private_segment_byte_size = 0 ; GCNHSA: private_segment_alignment = 4 ; GCNHSA: .end_amd_kernel_code_t -; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCNHSA: s_mov_b32 s10, -1 -; CIHSA: s_mov_b32 s11, 0x180f000 -; VIHSA: s_mov_b32 s11, 0x11800000 -; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen -; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, 
s[8:11], s6 offen +; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen +; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen ; Scratch size = alloca size + emergency stack slot ; ALL: ; ScratchSize: 32772 Index: test/CodeGen/AMDGPU/large-alloca-graphics.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -8,8 +8,8 @@ ; CI: s_mov_b32 s11, 0x80f000 ; VI: s_mov_b32 s11, 0x800000 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; ALL: ; ScratchSize: 32772 define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { @@ -29,8 +29,8 @@ ; CI: s_mov_b32 s11, 0x80f000 ; VI: s_mov_b32 s11, 0x800000 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; ALL: ; ScratchSize: 32772 define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { Index: test/CodeGen/AMDGPU/llvm.dbg.value.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -1,8 +1,8 @@ ; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_debug_value: -; CHECK: s_load_dwordx2 -; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- SGPR2_SGPR3 +; CHECK: s_load_dwordx2 s[4:5] +; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- SGPR4_SGPR5 ; CHECK: buffer_store_dword ; CHECK: s_endpgm define void 
@test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { Index: test/CodeGen/AMDGPU/local-memory-two-objects.ll =================================================================== --- test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -10,7 +10,7 @@ ; EG: .long 166120 ; EG-NEXT: .long 8 ; GCN: .long 47180 -; GCN-NEXT: .long 38792 +; GCN-NEXT: .long 32900 ; EG: {{^}}local_memory_two_objects: Index: test/CodeGen/AMDGPU/local-memory.ll =================================================================== --- test/CodeGen/AMDGPU/local-memory.ll +++ test/CodeGen/AMDGPU/local-memory.ll @@ -9,9 +9,9 @@ ; EG: .long 166120 ; EG-NEXT: .long 128 ; SI: .long 47180 -; SI-NEXT: .long 71560 +; SI-NEXT: .long 65668 ; CI: .long 47180 -; CI-NEXT: .long 38792 +; CI-NEXT: .long 32900 ; FUNC-LABEL: {{^}}local_memory: Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -17,16 +17,18 @@ ; GCN-LABEL: {{^}}spill_vgpr_compute: +; GCN: s_mov_b32 s16, s3 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_mov_b32 s15, 0x80f000 ; VI-NEXT: s_mov_b32 s15, 0x800000 -; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} ; GCN: NumVgprs: 256 ; GCN: 
ScratchSize: 1024 Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -11,14 +11,14 @@ ; GCN-LABEL: {{^}}main: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s11, 0x80f000 -; VI-NEXT: s_mov_b32 s11, 0x800000 +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 ; s12 is offset user SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 Index: test/CodeGen/AMDGPU/work-item-intrinsics.ll =================================================================== --- test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -9,9 +9,26 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].X +; HSA: .amd_kernel_code_t + +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; HSA: .end_amd_kernel_code_t + + ; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] + define void @ngroups_x (i32 addrspace(1)* %out) { entry: %0 = call i32 
@llvm.r600.read.ngroups.x() #0 @@ -98,10 +115,24 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].Z +; HSA: .amd_kernel_code_t +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; HSA: .end_amd_kernel_code_t + + ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x1 -; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x4 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 ; HSA: s_and_b32 [[VAL:s[0-9]+]], [[XY]], 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] @@ -116,10 +147,13 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].W +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; CI-HSA: s_load_dword [[XY_VAL:s[0-9]+]], s[0:1], 0x1 -; VI-HSA: s_load_dword [[XY_VAL:s[0-9]+]], s[0:1], 0x4 +; CI-HSA: s_load_dword [[XY_VAL:s[0-9]+]], s[4:5], 0x1 +; VI-HSA: s_load_dword [[XY_VAL:s[0-9]+]], s[4:5], 0x4 ; HSA: s_lshr_b32 [[VAL:s[0-9]+]], [[XY_VAL]], 16 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] @@ -134,10 +168,13 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[2].X +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 ; VI-NOHSA: s_load_dword 
[[VAL:s[0-9]+]], s[0:1], 0x20 -; CI-HSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI-HSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; CI-HSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x2 +; VI-HSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x8 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] define void @local_size_z (i32 addrspace(1)* %out) { @@ -152,8 +189,8 @@ ; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 ; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 ; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c -; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x1 -; VI-HSA; s_load_dword [[XY:s[0-9]+]], s[0:1], 0x4 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 ; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff ; HSA-DAG: s_lshr_b32 [[Y:s[0-9]+]], [[XY]], 16 ; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] @@ -169,14 +206,17 @@ } ; FUNC-LABEL: {{^}}local_size_xz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + ; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 ; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 ; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 ; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 -; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x1 -; CI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x2 -; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x4 -; VI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; CI-HSA: s_load_dword [[Z:s[0-9]+]], s[4:5], 0x2 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 +; VI-HSA: s_load_dword [[Z:s[0-9]+]], s[4:5], 0x8 ; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]] @@ -191,14 +231,17 @@ } ; FUNC-LABEL: {{^}}local_size_yz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1
+ ; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 ; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 ; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c ; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 -; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x1 -; CI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x2 -; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x4 -; VI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; CI-HSA: s_load_dword [[Z:s[0-9]+]], s[4:5], 0x2 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 +; VI-HSA: s_load_dword [[Z:s[0-9]+]], s[4:5], 0x8 ; HSA-DAG: s_lshr_b32 [[Y:s[0-9]+]], [[XY]], 16 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]] @@ -213,16 +256,19 @@ } ; FUNC-LABEL: {{^}}local_size_xyz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + ; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 ; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 ; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 ; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 ; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c ; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 -; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x1 -; CI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x2 -; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x4 -; VI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; CI-HSA: s_load_dword [[Z:s[0-9]+]], s[4:5], 0x2 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 +; VI-HSA: s_load_dword [[Z:s[0-9]+]], s[4:5], 0x8 ; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff ; HSA-DAG: s_lshr_b32 [[Y:s[0-9]+]], [[XY]], 16 ; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] @@ -255,14 +301,33 @@ ret void } -; The tgid values are stored in sgprs offset by the number of user sgprs. 
-; Currently we always use exactly 2 user sgprs for the pointer to the -; kernel arguments, but this may change in the future. +; The tgid values are stored in sgprs offset by the number of user +; sgprs. ; FUNC-LABEL: {{^}}tgid_x: -; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 -; GCN-NOHSA: buffer_store_dword [[VVAL]] -define void @tgid_x (i32 addrspace(1)* %out) { +; HSA: .amd_kernel_code_t +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; HSA: .end_amd_kernel_code_t + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}} +; GCN: buffer_store_dword [[VVAL]] + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -270,9 +335,25 @@ } ; FUNC-LABEL: {{^}}tgid_y: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 1 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3 +; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7 ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_y (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; 
GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -280,16 +361,46 @@ } ; FUNC-LABEL: {{^}}tgid_z: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 1 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}} ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 132{{$}} + ; FUNC-LABEL: {{^}}tidig_x: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 ; GCN: buffer_store_dword v0 define void @tidig_x (i32 addrspace(1)* %out) { entry: @@ -298,7 +409,13 @@ ret void } +; GCN-NOHSA: .section 
.AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 2180{{$}} + ; FUNC-LABEL: {{^}}tidig_y: + +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1 ; GCN: buffer_store_dword v1 define void @tidig_y (i32 addrspace(1)* %out) { entry: @@ -307,7 +424,12 @@ ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 4228{{$}} + ; FUNC-LABEL: {{^}}tidig_z: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2 ; GCN: buffer_store_dword v2 define void @tidig_z (i32 addrspace(1)* %out) { entry: