diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -3657,14 +3657,22 @@ ``compute_pgm_rsrc2.user_sgpr.user_sgpr_count``. Any requests beyond 16 will be ignored. - >448 1 bit ENABLE_SGPR_PRIVATE_SEGMENT - _BUFFER + >448 1 bit ENABLE_SGPR_PRIVATE_SEGMENT If the *Target Properties* + _BUFFER column of + :ref:`amdgpu-processor-table` + specifies *Architected flat + scratch* then not supported + and must be 0. >449 1 bit ENABLE_SGPR_DISPATCH_PTR >450 1 bit ENABLE_SGPR_QUEUE_PTR >451 1 bit ENABLE_SGPR_KERNARG_SEGMENT_PTR >452 1 bit ENABLE_SGPR_DISPATCH_ID - >453 1 bit ENABLE_SGPR_FLAT_SCRATCH_INIT - + >453 1 bit ENABLE_SGPR_FLAT_SCRATCH_INIT If the *Target Properties* + column of + :ref:`amdgpu-processor-table` + specifies *Architected flat + scratch* then not supported + and must be 0. >454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT _SIZE 457:455 3 bits Reserved, must be 0. @@ -3984,14 +3992,27 @@ ======= ======= =============================== =========================================================================== Bits Size Field Name Description ======= ======= =============================== =========================================================================== - 0 1 bit ENABLE_PRIVATE_SEGMENT Enable the setup of the - private segment. - - In addition, enable the - setup of the SGPR - wavefront scratch offset - system register (see - :ref:`amdgpu-amdhsa-initial-kernel-execution-state`). + 0 1 bit ENABLE_PRIVATE_SEGMENT * Enable the setup of the + private segment. + * If the *Target Properties* + column of + :ref:`amdgpu-processor-table` + does not specify + *Architected flat + scratch* then enable the + setup of the SGPR + wavefront scratch offset + system register (see + :ref:`amdgpu-amdhsa-initial-kernel-execution-state`). 
+ * If the *Target Properties* + column of + :ref:`amdgpu-processor-table` + specifies *Architected + flat scratch* then enable + the setup of the + FLAT_SCRATCH register + pair (see + :ref:`amdgpu-amdhsa-initial-kernel-execution-state`). Used by CP to set up ``COMPUTE_PGM_RSRC2.SCRATCH_EN``. @@ -4550,12 +4571,26 @@ segment address when using the Scratch Segment Buffer (see :ref:`amdgpu-amdhsa-kernel-prolog-private-segment-buffer`). +* If the *Target Properties* column of :ref:`amdgpu-processor-table` + specifies *Architected flat scratch*: + + If ENABLE_PRIVATE_SEGMENT is enabled in + :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx10-table` then the FLAT_SCRATCH + register pair will be initialized to the 64-bit address of the base of scratch + backing memory being managed by SPI for the queue executing the kernel + dispatch plus the value of the wave's Scratch Wavefront Offset for use as the + flat scratch base in flat memory instructions. + .. _amdgpu-amdhsa-kernel-prolog-private-segment-buffer: Private Segment Buffer ++++++++++++++++++++++ -Private Segment Buffer SGPR register is used to initialize 4 SGPRs +If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies +*Architected flat scratch* then a Private Segment Buffer is not supported. +Instead the flat SCRATCH instructions are used. + +Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs that are used as a V# to access scratch. CP uses the value provided by the runtime. It is used, together with Scratch Wavefront Offset as an offset, to access the private memory space using a segment address. 
See diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -731,6 +731,12 @@ "Workitem IDs are packed into v0 at kernel launch" >; +def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch", + "HasArchitectedFlatScratch", + "true", + "Flat Scratch register is a readonly SPI initialized architected register" +>; + // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -723,7 +723,9 @@ const SIRegisterInfo &TRI = TII->getRegisterInfo(); Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || - MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI); + MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) || + MRI.isLiveIn(MFI->getPreloadedReg( + AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT)); // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat // instructions aren't used to access the scratch buffer. 
Inline assembly may diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -289,6 +289,7 @@ FlatGlobalInsts(false), FlatScratchInsts(false), ScalarFlatScratchInsts(false), + HasArchitectedFlatScratch(false), AddNoCarryInsts(false), HasUnpackedD16VMem(false), LDSMisalignedBug(false), @@ -327,7 +328,8 @@ } bool GCNSubtarget::enableFlatScratch() const { - return EnableFlatScratch && hasFlatScratchInsts(); + return flatScratchIsArchitected() || + (EnableFlatScratch && hasFlatScratchInsts()); } unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1373,6 +1373,10 @@ return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets]; } + bool hasArchitectedFlatScratch() const { + return getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; + } + bool hasSGPR102_SGPR103() const { return !isVI() && !isGFX9(); } @@ -4549,6 +4553,10 @@ return OutOfRangeError(ValRange); KD.kernarg_size = Val; } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") { + if (hasArchitectedFlatScratch()) + return Error(IDRange.Start, + "directive is not supported with architected flat scratch", + IDRange); PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, Val, ValRange); @@ -4579,6 +4587,10 @@ if (Val) UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { + if (hasArchitectedFlatScratch()) + return Error(IDRange.Start, + "directive is not supported with architected flat scratch", + IDRange); PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, ValRange); @@ -4598,10 +4610,20 @@ 
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, Val, ValRange); } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") { - PARSE_BITS_ENTRY( - KD.compute_pgm_rsrc2, - COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, - ValRange); + if (hasArchitectedFlatScratch()) + return Error(IDRange.Start, + "directive is not supported with architected flat scratch", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange); + } else if (ID == ".amdhsa_enable_private_segment") { + if (!hasArchitectedFlatScratch()) + return Error( + IDRange.Start, + "directive is not supported without architected flat scratch", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange); } else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val, @@ -4639,6 +4661,10 @@ } else if (ID == ".amdhsa_reserve_flat_scratch") { if (IVersion.Major < 7) return Error(IDRange.Start, "directive requires gfx7+", IDRange); + if (hasArchitectedFlatScratch()) + return Error(IDRange.Start, + "directive is not supported with architected flat scratch", + IDRange); if (!isUInt<1>(Val)) return OutOfRangeError(ValRange); ReserveFlatScr = Val; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -174,6 +174,8 @@ bool isGFX9Plus() const; bool isGFX10() const; bool isGFX10Plus() const; + + bool hasArchitectedFlatScratch() const; }; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ 
b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1457,6 +1457,10 @@ return AMDGPU::isGFX10Plus(STI); } +bool AMDGPUDisassembler::hasArchitectedFlatScratch() const { + return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; +} + //===----------------------------------------------------------------------===// // AMDGPU specific symbol handling //===----------------------------------------------------------------------===// @@ -1516,7 +1520,8 @@ AMDGPU::IsaInfo::getSGPREncodingGranule(&STI); KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n'; - KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n'; + if (!hasArchitectedFlatScratch()) + KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n'; KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n'; KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n"; @@ -1567,9 +1572,12 @@ uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { using namespace amdhsa; StringRef Indent = "\t"; - PRINT_DIRECTIVE( - ".amdhsa_system_sgpr_private_segment_wavefront_offset", - COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); + if (hasArchitectedFlatScratch()) + PRINT_DIRECTIVE(".amdhsa_enable_private_segment", + COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); + else + PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset", + COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x", COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y", @@ -1710,8 +1718,9 @@ using namespace amdhsa; TwoByteBuffer = DE.getU16(Cursor); - PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer", - KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); + if (!hasArchitectedFlatScratch()) + PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr", 
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr", @@ -1720,8 +1729,9 @@ KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id", KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); - PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init", - KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + if (!hasArchitectedFlatScratch()) + PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init", + KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -167,6 +167,7 @@ bool FlatGlobalInsts; bool FlatScratchInsts; bool ScalarFlatScratchInsts; + bool HasArchitectedFlatScratch; bool AddNoCarryInsts; bool HasUnpackedD16VMem; bool R600ALUInst; @@ -985,6 +986,10 @@ return getGeneration() >= AMDGPUSubtarget::GFX9; } + /// \returns true if the flat_scratch register is initialized by the HW. + /// In this case it is readonly. 
+ bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } + /// \returns true if the machine has merged shaders in which s0-s7 are /// reserved by the hardware and user SGPRs start at s8 bool hasMergedShaders() const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -315,9 +315,11 @@ << KD.private_segment_fixed_size << '\n'; OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n'; - PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); + if (!hasArchitectedFlatScratch(STI)) + PRINT_FIELD( + OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); @@ -330,9 +332,10 @@ PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); - PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + if (!hasArchitectedFlatScratch(STI)) + PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); @@ -340,10 +343,12 @@ PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); - PRINT_FIELD( - OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", 
KD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); + PRINT_FIELD(OS, + (hasArchitectedFlatScratch(STI) + ? ".amdhsa_enable_private_segment" + : ".amdhsa_system_sgpr_private_segment_wavefront_offset"), + KD, compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); @@ -372,7 +377,7 @@ if (!ReserveVCC) OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n'; - if (IVersion.Major >= 7 && !ReserveFlatScr) + if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI)) OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n'; if (Optional HsaAbiVer = getHsaAbiVersion(&STI)) { diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -493,7 +493,8 @@ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); } - if (MFI->hasFlatScratchInit() || ScratchRsrcReg) { + if ((MFI->hasFlatScratchInit() || ScratchRsrcReg) && + !ST.flatScratchIsArchitected()) { MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -124,13 +124,15 @@ if (WorkItemIDZ) WorkItemIDY = true; - PrivateSegmentWaveByteOffset = true; - - // HS and GS always have the scratch wave offset in SGPR5 on GFX9. 
- if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && - (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - ArgInfo.PrivateSegmentWaveByteOffset = - ArgDescriptor::createRegister(AMDGPU::SGPR5); + if (!ST.flatScratchIsArchitected()) { + PrivateSegmentWaveByteOffset = true; + + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + ArgInfo.PrivateSegmentWaveByteOffset = + ArgDescriptor::createRegister(AMDGPU::SGPR5); + } } bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); @@ -162,7 +164,8 @@ KernargSegmentPtr = true; if (ST.hasFlatAddressSpace() && isEntryFunction() && - (isAmdHsaOrMesa || ST.enableFlatScratch())) { + (isAmdHsaOrMesa || ST.enableFlatScratch()) && + !ST.flatScratchIsArchitected()) { // TODO: This could be refined a lot. The attribute is a poor way of // detecting calls or stack objects that may require it before argument // lowering. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -740,6 +740,7 @@ bool isGFX10_BEncoding(const MCSubtargetInfo &STI); bool hasGFX10_3Insts(const MCSubtargetInfo &STI); bool isGFX90A(const MCSubtargetInfo &STI); +bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1459,6 +1459,10 @@ return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]; } +bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; +} + bool isSGPR(unsigned Reg, const 
MCRegisterInfo* TRI) { const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll @@ -1,16 +1,24 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,RW-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GCN,RO-FLAT %s ; Make sure flat_scratch_init is set ; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls: -; GCN: s_add_u32 flat_scratch_lo, s4, s7 -; GCN: s_addc_u32 flat_scratch_hi, s5, 0 -; GCN: flat_store_dword -; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 -; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset -; GCN-NOT: .amdhsa_reserve_flat_scratch -; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1 -; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7 +; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0 +; RO-FLAT-NOT: flat_scratch +; GCN: flat_store_dword +; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer +; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1 +; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init +; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset +; RW-FLAT-NOT: .amdhsa_enable_private_segment +; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset +; RO-FLAT: .amdhsa_enable_private_segment 1 +; GCN-NOT: .amdhsa_reserve_flat_scratch +; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1 +; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0 define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { %alloca = alloca i32, 
addrspace(5) %cast = addrspacecast i32 addrspace(5)* %alloca to i32* @@ -20,15 +28,23 @@ ; TODO: Could optimize out in this case ; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls: -; GCN: s_add_u32 flat_scratch_lo, s4, s7 -; GCN: s_addc_u32 flat_scratch_hi, s5, 0 -; GCN: buffer_store_dword -; GCN: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 -; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 -; GCN-NOT: .amdhsa_reserve_flat_scratch -; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1 -; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7 +; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0 +; RO-FLAT-NOT: flat_scratch +; RW-FLAT: buffer_store_dword +; RO-FLAT: scratch_store_dword +; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1 +; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer +; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1 +; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init +; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 +; RW-FLAT-NOT: .amdhsa_enable_private_segment +; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset +; RO-FLAT: .amdhsa_enable_private_segment 1 +; GCN-NOT: .amdhsa_reserve_flat_scratch +; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1 +; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0 define amdgpu_kernel void @stack_object_in_kernel_no_calls() { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -36,13 +52,20 @@ } ; GCN-LABEL: {{^}}kernel_no_calls_no_stack: -; GCN-NOT: flat_scratch -; GCN: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCN: .amdhsa_user_sgpr_flat_scratch_init 0 -; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 -; GCN: .amdhsa_reserve_flat_scratch 0 -; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 -; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 4 +; GCN-NOT: flat_scratch +; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1 +; RO-FLAT-NOT: 
.amdhsa_user_sgpr_private_segment_buffer +; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0 +; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init +; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; RW-FLAT-NOT: .amdhsa_enable_private_segment +; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset +; RO-FLAT: .amdhsa_enable_private_segment 0 +; RW-FLAT: .amdhsa_reserve_flat_scratch 0 +; RO-FLAT-NOT: .amdhsa_reserve_flat_scratch 0 +; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4 +; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0 define amdgpu_kernel void @kernel_no_calls_no_stack() { ret void }