diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1925,6 +1925,10 @@
   Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+def int_amdgcn_reduce_add :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
 def int_amdgcn_inverse_ballot :
   Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -41,6 +41,7 @@
 FunctionPass *createSIOptimizeExecMaskingPreRAPass();
 FunctionPass *createSIOptimizeVGPRLiveRangePass();
 FunctionPass *createSIFixSGPRCopiesPass();
+FunctionPass *createSILowerReduceAndScanPseudoPass();
 FunctionPass *createLowerWWMCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
@@ -145,6 +146,9 @@
 void initializeSIFixVGPRCopiesPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
+void initializeSIExpandReduceAndScanPseudoPass(PassRegistry &);
+extern char &SIExpandReduceAndScanPseudoID;
+
 void initializeSILowerWWMCopiesPass(PassRegistry &);
 extern char &SILowerWWMCopiesID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -364,6 +364,7 @@
   initializeAMDGPUDAGToDAGISelPass(*PR);
   initializeGCNDPPCombinePass(*PR);
   initializeSILowerI1CopiesPass(*PR);
+  initializeSIExpandReduceAndScanPseudoPass(*PR);
   initializeSILowerWWMCopiesPass(*PR);
   initializeSILowerSGPRSpillsPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
@@ -1199,6 +1200,7 @@
   AMDGPUPassConfig::addInstSelector();
   addPass(&SIFixSGPRCopiesID);
   addPass(createSILowerI1CopiesPass());
+  addPass(&SIExpandReduceAndScanPseudoID);
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -145,6 +145,7 @@
   SILoadStoreOptimizer.cpp
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
+  SILowerReduceAndScanPseudo.cpp
   SILowerWWMCopies.cpp
   SILowerSGPRSpills.cpp
   SIMachineFunctionInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -26,6 +26,14 @@
 
 namespace AMDGPU {
 struct ImageDimIntrinsicInfo;
+
+/// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is
+/// true, \p MI will be the only instruction in the loop body block. Otherwise,
+/// it will be the first instruction in the remainder block.
+///
+/// \returns { LoopBody, Remainder }
+std::pair<MachineBasicBlock *, MachineBasicBlock *>
+splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop);
 }
 
 class SITargetLowering final : public AMDGPUTargetLowering {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3578,8 +3578,9 @@
 // be the first instruction in the remainder block.
 //
 /// \returns { LoopBody, Remainder }
-static std::pair<MachineBasicBlock *, MachineBasicBlock *>
-splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
+std::pair<MachineBasicBlock *, MachineBasicBlock *>
+llvm::AMDGPU::splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB,
+                                bool InstInLoop) {
   MachineFunction *MF = MBB.getParent();
   MachineBasicBlock::iterator I(&MI);
 
@@ -3645,7 +3646,8 @@
   if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
     Src->setIsKill(false);
 
-  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
+  std::tie(LoopBB, RemainderBB) =
+      llvm::AMDGPU::splitBlockForLoop(MI, *BB, true);
 
   MachineBasicBlock::iterator I = LoopBB->end();
 
@@ -3798,7 +3800,8 @@
   MachineBasicBlock *LoopBB;
   MachineBasicBlock *RemainderBB;
 
-  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
+  std::tie(LoopBB, RemainderBB) =
+      llvm::AMDGPU::splitBlockForLoop(MI, MBB, false);
 
   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -258,6 +258,13 @@
   }
 } // End Defs = [SCC]
 
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+  def REDUCE_ADD_PSEUDO : VPseudoInstSI <(outs SGPR_32:$sdst),
+    (ins VSrc_b32:$src),
+    [(set i32:$sdst, (int_amdgcn_reduce_add i32:$src))]> {
+  }
+}
+
 let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
 def V_ADD_U64_PSEUDO : VPseudoInstSI <
   (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
diff --git a/llvm/lib/Target/AMDGPU/SILowerReduceAndScanPseudo.cpp b/llvm/lib/Target/AMDGPU/SILowerReduceAndScanPseudo.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SILowerReduceAndScanPseudo.cpp
@@ -0,0 +1,211 @@
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIISelLowering.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "expand-reduce-and-scan"
+
+namespace {
+struct SIExpandReduceAndScanPseudo : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  SIExpandReduceAndScanPseudo() : MachineFunctionPass(ID) {}
+
+  /// runOnMachineFunction - pass entry point
+  bool runOnMachineFunction(MachineFunction &) override;
+
+  void expandPseudo(MachineInstr &MI, unsigned AtomicOpc, bool isSGPR);
+
+  void reduceUniformSGPRValue(MachineInstr &MI);
+
+  void reduceDivergentVGPRValue(MachineInstr &MI, unsigned AtomicOpc);
+
+  StringRef getPassName() const override {
+    return "SI Lower Reduce and Scan Pseudos";
+  }
+
+private:
+  const SIRegisterInfo *TRI;
+  const GCNSubtarget *ST;
+  const SIInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+};
+
+} // end anonymous namespace
+
+char SIExpandReduceAndScanPseudo::ID = 0;
+char &llvm::SIExpandReduceAndScanPseudoID = SIExpandReduceAndScanPseudo::ID;
+
+INITIALIZE_PASS_BEGIN(SIExpandReduceAndScanPseudo, DEBUG_TYPE,
+                      "Expand Reduction and Scan Pseudos", false, false)
+INITIALIZE_PASS_END(SIExpandReduceAndScanPseudo, DEBUG_TYPE,
+                    "Expand Reduction and Scan Pseudos", false, false)
+
+void SIExpandReduceAndScanPseudo::reduceUniformSGPRValue(MachineInstr &MI) {
+
+  Register DstReg = MI.getOperand(0).getReg();
+
+  Register BitCntDst = MRI->createVirtualRegister(MRI->getRegClass(DstReg));
+  unsigned BitCntOp =
+      ST->isWave32() ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+  // The value is uniform across the wave, so the reduction is the value
+  // multiplied by the number of active lanes.
+  auto BitCntMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+                          TII->get(BitCntOp), BitCntDst)
+                      .addReg(TRI->getExec());
+
+  Register BitCntReg = BitCntMI->getOperand(0).getReg();
+  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::S_MUL_I32),
+          DstReg)
+      .addReg(BitCntReg)
+      .addReg(MI.getOperand(1).getReg());
+}
+
+void SIExpandReduceAndScanPseudo::reduceDivergentVGPRValue(MachineInstr &MI,
+                                                           unsigned AtomicOpc) {
+
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  MachineBasicBlock::iterator I = MBB->end();
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  // Create the control flow for the loop.
+  MachineBasicBlock *ComputeLoop;
+  MachineBasicBlock *ComputeEnd;
+
+  // Split MI's machine basic block to form the loop.
+  std::tie(ComputeLoop, ComputeEnd) =
+      llvm::AMDGPU::splitBlockForLoop(MI, *MBB, true);
+
+  bool IsWave32 = ST->isWave32();
+  const TargetRegisterClass *RegClass =
+      IsWave32 ? &AMDGPU::SReg_32RegClass : &AMDGPU::SReg_64RegClass;
+
+  // Create the registers required for lowering.
+  Register LoopIterator = MRI->createVirtualRegister(RegClass);
+  Register InitialValReg = MRI->createVirtualRegister(MRI->getRegClass(DstReg));
+
+  Register AccumulatorReg =
+      MRI->createVirtualRegister(MRI->getRegClass(DstReg));
+  Register NewAccumulatorReg =
+      MRI->createVirtualRegister(MRI->getRegClass(DstReg));
+
+  Register ActiveBitsReg = MRI->createVirtualRegister(RegClass);
+  Register NewActiveBitsReg = MRI->createVirtualRegister(RegClass);
+
+  Register FF1Reg = MRI->createVirtualRegister(MRI->getRegClass(DstReg));
+  Register LaneValueReg = MRI->createVirtualRegister(MRI->getRegClass(DstReg));
+
+  unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+  // Create the initial values of the induction variable (from Exec) and of the
+  // accumulator, then branch to ComputeLoop.
+  auto TmpSReg =
+      BuildMI(*MBB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
+  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitialValReg).addImm(0);
+  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+  // Start constructing ComputeLoop.
+  I = ComputeLoop->end();
+  auto Accumulator =
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+          .addReg(InitialValReg)
+          .addMBB(MBB);
+  auto ActiveBits =
+      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+          .addReg(TmpSReg->getOperand(0).getReg())
+          .addMBB(MBB);
+
+  // Perform the computations: find the lowest active lane, read its value and
+  // accumulate it.
+  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+  auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+                 .addReg(ActiveBits->getOperand(0).getReg());
+  auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+                           TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
+                       .addReg(SrcReg)
+                       .addReg(FF1->getOperand(0).getReg());
+  auto NewAccumulator =
+      BuildMI(*ComputeLoop, I, DL, TII->get(AtomicOpc), NewAccumulatorReg)
+          .addReg(Accumulator->getOperand(0).getReg())
+          .addReg(LaneValue->getOperand(0).getReg());
+
+  // Clear the processed lane to get the next active lane.
+  unsigned BITSETOpc =
+      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+  auto NewActiveBits =
+      BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+          .addReg(FF1->getOperand(0).getReg())
+          .addReg(ActiveBits->getOperand(0).getReg());
+
+  // Add the loop back-edge values to the phi nodes.
+  Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+      .addMBB(ComputeLoop);
+  ActiveBits.addReg(NewActiveBits->getOperand(0).getReg()).addMBB(ComputeLoop);
+
+  // Create the back branch: loop while there are still active lanes.
+  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+  BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+      .addReg(NewActiveBits->getOperand(0).getReg())
+      .addImm(0);
+  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+      .addMBB(ComputeLoop);
+
+  MRI->replaceRegWith(DstReg, NewAccumulator->getOperand(0).getReg());
+}
+
+void SIExpandReduceAndScanPseudo::expandPseudo(MachineInstr &MI,
+                                               unsigned AtomicOpc,
+                                               bool isSGPR) {
+  if (isSGPR) {
+    reduceUniformSGPRValue(MI);
+  } else {
+    reduceDivergentVGPRValue(MI, AtomicOpc);
+  }
+}
+
+bool SIExpandReduceAndScanPseudo::runOnMachineFunction(MachineFunction &MF) {
+
+  ST = &MF.getSubtarget<GCNSubtarget>();
+  TII = ST->getInstrInfo();
+  TRI = ST->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  unsigned AtomicOpc = 0;
+  bool isSGPR = false;
+  SmallVector<MachineInstr *> ReduceInstrs;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() == AMDGPU::REDUCE_ADD_PSEUDO) {
+        ReduceInstrs.push_back(&MI);
+      } else {
+        // TODO: Support reduction and scan of other atomic operations
+      }
+    }
+  }
+
+  // Process reduce instructions
+  for (MachineInstr *MI : ReduceInstrs) {
+    if (MI->getOpcode() == AMDGPU::REDUCE_ADD_PSEUDO) {
+      Register SrcReg = MI->getOperand(1).getReg();
+      AtomicOpc = AMDGPU::S_ADD_U32;
+      isSGPR = TRI->isSGPRClass(MRI->getRegClass(SrcReg));
+    } else {
+      // TODO: Get the appropriate operation for the remaining atomics
+    }
+
+    expandPseudo(*MI, AtomicOpc, isSGPR);
+    MI->eraseFromParent();
+  }
+
+  return !ReduceInstrs.empty();
+}
+
+FunctionPass *llvm::createSILowerReduceAndScanPseudoPass() {
+  return new SIExpandReduceAndScanPseudo();
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -98,6 +98,7 @@
 ; GCN-O0-NEXT:       SI Fix SGPR copies
 ; GCN-O0-NEXT:       MachinePostDominator Tree Construction
 ; GCN-O0-NEXT:       SI Lower i1 Copies
+; GCN-O0-NEXT:       SI Lower Reduce and Scan Pseudos
 ; GCN-O0-NEXT:       Finalize ISel and expand pseudo-instructions
 ; GCN-O0-NEXT:       Local Stack Slot Allocation
 ; GCN-O0-NEXT:       Register Usage Information Propagation
@@ -296,6 +297,7 @@
 ; GCN-O1-NEXT:       SI Fix SGPR copies
 ; GCN-O1-NEXT:       MachinePostDominator Tree Construction
 ; GCN-O1-NEXT:       SI Lower i1 Copies
+; GCN-O1-NEXT:       SI Lower Reduce and Scan Pseudos
 ; GCN-O1-NEXT:       Finalize ISel and expand pseudo-instructions
 ; GCN-O1-NEXT:       Lazy Machine Block Frequency Analysis
 ; GCN-O1-NEXT:       Early Tail Duplication
@@ -582,6 +584,7 @@
 ; GCN-O1-OPTS-NEXT:       SI Fix SGPR copies
 ; GCN-O1-OPTS-NEXT:       MachinePostDominator Tree Construction
 ; GCN-O1-OPTS-NEXT:       SI Lower i1 Copies
+; GCN-O1-OPTS-NEXT:       SI Lower Reduce and Scan Pseudos
 ; GCN-O1-OPTS-NEXT:       Finalize ISel and expand pseudo-instructions
 ; GCN-O1-OPTS-NEXT:       Lazy Machine Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT:       Early Tail Duplication
@@ -885,6 +888,7 @@
 ; GCN-O2-NEXT:       SI Fix SGPR copies
 ; GCN-O2-NEXT:       MachinePostDominator Tree Construction
 ; GCN-O2-NEXT:       SI Lower i1 Copies
+; GCN-O2-NEXT:       SI Lower Reduce and Scan Pseudos
 ; GCN-O2-NEXT:       Finalize ISel and expand pseudo-instructions
 ; GCN-O2-NEXT:       Lazy Machine Block Frequency Analysis
 ; GCN-O2-NEXT:       Early Tail Duplication
@@ -1201,6 +1205,7 @@
 ; GCN-O3-NEXT:       SI Fix SGPR copies
 ; GCN-O3-NEXT:       MachinePostDominator Tree Construction
 ; GCN-O3-NEXT:       SI Lower i1 Copies
+; GCN-O3-NEXT:       SI Lower Reduce and Scan Pseudos
 ; GCN-O3-NEXT:       Finalize ISel and expand pseudo-instructions
 ; GCN-O3-NEXT:       Lazy Machine Block Frequency Analysis
 ; GCN-O3-NEXT:       Early Tail Duplication
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.reduce.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.reduce.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.reduce.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; CHECK-LABEL: uniform_value:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; CHECK-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; CHECK-NEXT:    s_bcnt1_i32_b32 s3, exec_lo
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_mul_i32 s2, s3, s2
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; CHECK-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.reduce.add(i32 %in)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, ptr addrspace(1) %val) #0 {
+; CHECK-LABEL: divergent_value:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_b32 v1, v0, s[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b32 s3, exec_lo
+; CHECK-NEXT:    s_mov_b32 s2, 0
+; CHECK-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_ctz_i32_b32 s4, s3
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s5, v1, s4
+; CHECK-NEXT:    s_bitset0_b32 s3, s4
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    s_add_u32 s2, s2, s5
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB1_1
+; CHECK-NEXT:  ; %bb.2:
+; CHECK-NEXT:    v_mov_b32_e32 v1, s2
+; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1] offset:16
+; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; CHECK-NEXT:    s_endpgm
+entry:
+  %divergent_value.kernarg.segment = call nonnull align 16 dereferenceable(52) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+  %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %divergent_value.kernarg.segment, i64 36
+  %loaded.out.kernarg.offset = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4
+  %out.load1 = extractelement <2 x i64> %loaded.out.kernarg.offset, i32 0
+  %mem.location = inttoptr i64 %out.load1 to ptr addrspace(1)
+  %val.load2 = extractelement <2 x i64> %loaded.out.kernarg.offset, i32 1
+  %value.address = inttoptr i64 %val.load2 to ptr addrspace(1)
+  %lane = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %idxprom = sext i32 %lane to i64
+  %ele = getelementptr i32, ptr addrspace(1) %value.address, i64 %idxprom
+  %value = load i32, ptr addrspace(1) %ele, align 4
+  %gep = getelementptr i32, ptr addrspace(1) %mem.location, i32 4
+  %result = call i32 @llvm.amdgcn.reduce.add(i32 %value)
+  store i32 %result, ptr addrspace(1) %gep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.reduce.add(i32)
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+
+declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+
+attributes #0 = { "target-cpu"="gfx1100" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
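
For illustration only, and not part of the patch itself: a minimal IR sketch of calling the new intrinsic on a wave64 target. The gfx90a CPU choice and the kernel below are assumptions rather than material from the patch; on wave64 the divergent expansion would pick the S_FF1_I32_B64 / S_BITSET0_B64 / S_CMP_LG_U64 forms instead of the B32 variants exercised by the gfx1100 test above.

; Illustrative sketch, not an autogenerated test; assumes a wave64 target.
; e.g. llc -march=amdgcn -mcpu=gfx90a < %s
define amdgpu_kernel void @divergent_value_wave64(ptr addrspace(1) %out, ptr addrspace(1) %val) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = sext i32 %lane to i64
  %ele = getelementptr i32, ptr addrspace(1) %val, i64 %idx
  %value = load i32, ptr addrspace(1) %ele, align 4
  ; Divergent input, so REDUCE_ADD_PSEUDO is expanded into the readlane loop.
  %result = call i32 @llvm.amdgcn.reduce.add(i32 %value)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}

declare i32 @llvm.amdgcn.reduce.add(i32)
declare i32 @llvm.amdgcn.workitem.id.x()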