Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -658,6 +658,15 @@
     return getGeneration() < GFX9;
   }
 
+  // True if the hardware rewinds and replays GWS operations if a wave is
+  // preempted.
+  //
+  // If this is false, a GWS operation requires testing whether a nack set the
+  // MEM_VIOL bit, and repeating the operation if so.
+  bool hasGWSAutoReplay() const {
+    return getGeneration() >= GFX9;
+  }
+
   bool hasAddNoCarry() const {
     return AddNoCarryInsts;
   }
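The requirement that comment describes is what the new custom inserter (emitGWSMemViolTestLoop, added in SIISelLowering.cpp below) implements. As a rough C++-flavored sketch of the protocol, with hypothetical helper names standing in for the actual instructions matched by the LOOP check lines in the tests:

    // Illustrative pseudocode only; each helper models one instruction from
    // the emitted loop, not a real API.
    void clearMemViol();     // s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
    void issueGWSOp();       // e.g. ds_gws_barrier v0 offset:1 gds
    void waitAllCounters();  // s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
    bool memViolSet();       // s_getreg_b32 of TRAP_STS.MEM_VIOL

    void gwsWithManualReplay() {
      do {
        clearMemViol();        // clear any stale MEM_VIOL before issuing
        issueGWSOp();          // the GWS operation that may be nacked
        waitAllCounters();     // make a nack visible before testing the bit
      } while (memViolSet());  // preempted/nacked: repeat the operation
    }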
Index: lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- lib/Target/AMDGPU/DSInstructions.td
+++ lib/Target/AMDGPU/DSInstructions.td
@@ -467,7 +467,7 @@
 defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
 defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
 
-let isConvergent = 1 in {
+let isConvergent = 1, usesCustomInserter = 1 in {
 def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
   let mayLoad = 0;
 }
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -323,6 +323,8 @@
   OFFSET_WIDTH_ = 5,
   OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
 
+  OFFSET_MEM_VIOL = 8,
+
   OFFSET_SRC_SHARED_BASE = 16,
   OFFSET_SRC_PRIVATE_BASE = 0
 };
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -313,6 +313,9 @@
   MachineBasicBlock *splitKillBlock(MachineInstr &MI,
                                     MachineBasicBlock *BB) const;
 
+  MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI,
+                                            MachineBasicBlock *BB) const;
+
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *BB) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2923,6 +2923,109 @@
   return SplitBB;
 }
 
+// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is
+// true, \p MI will be the only instruction in the loop body block. Otherwise,
+// it will be the first instruction in the remainder block.
+//
+/// \returns { LoopBody, Remainder }
+static std::pair<MachineBasicBlock *, MachineBasicBlock *>
+splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock::iterator I(&MI);
+
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF->insert(MBBI, LoopBB);
+  MF->insert(MBBI, RemainderBB);
+
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(RemainderBB);
+
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  if (InstInLoop) {
+    auto Next = std::next(I);
+
+    // Move instruction to loop body.
+    LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
+
+    // Move the rest of the block.
+    RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
+  } else {
+    RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+  }
+
+  MBB.addSuccessor(LoopBB);
+
+  return std::make_pair(LoopBB, RemainderBB);
+}
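For reference, the refactored helper has exactly two callers after this change; condensed from the hunks in this file (illustrative excerpt, not a verbatim hunk):

    MachineBasicBlock *LoopBB;
    MachineBasicBlock *RemainderBB;

    // GWS lowering: the GWS instruction must sit alone in the loop body so
    // the whole operation can be re-executed until MEM_VIOL stays clear.
    std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);

    // v_movrel waterfall loop: the loop body is populated afterwards, so MI
    // instead becomes the first instruction of the remainder block.
    std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);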
+
+MachineBasicBlock *
+SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
+                                         MachineBasicBlock *BB) const {
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+  MachineBasicBlock *LoopBB;
+  MachineBasicBlock *RemainderBB;
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  MachineBasicBlock::iterator Prev = std::prev(MI.getIterator());
+
+  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
+
+  MachineBasicBlock::iterator I = LoopBB->end();
+
+  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+  assert(Src && "missing operand from GWS instruction");
+
+  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
+      AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
+
+  // Clear TRAP_STS.MEM_VIOL
+  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+    .addImm(0)
+    .addImm(EncodedReg);
+
+  // This is a pain, but we're not allowed to have physical register live-ins
+  // yet. Insert a pair of copies if the VGPR0 hack is necessary.
+  if (TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
+    unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0)
+      .add(*Src);
+
+    BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg())
+      .addReg(Data0);
+
+    MRI.setSimpleHint(Data0, Src->getReg());
+  }
+
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
+    .addImm(0);
+
+  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  // Load and check TRAP_STS.MEM_VIOL
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
+    .addImm(EncodedReg);
+
+  // FIXME: Do we need to use an isel pseudo that may clobber scc?
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+    .addReg(Reg, RegState::Kill)
+    .addImm(0);
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+    .addMBB(LoopBB);
+
+  return RemainderBB;
+}
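The hwreg operand built above is the same one the tests below spell as hwreg(HW_REG_TRAPSTS, 8, 1). A self-contained sketch of the arithmetic, assuming the usual SIMM16 hwreg layout (id in bits 5:0, field offset in bits 10:6, width minus one in bits 15:11) and the values ID_TRAPSTS = 3 and OFFSET_MEM_VIOL = 8; the function name is a hypothetical stand-in, not the AMDGPUBaseInfo API:

    #include <cstdint>

    // Hypothetical re-derivation of the encoded immediate; the field layout
    // and the ID_TRAPSTS value are assumptions, not quoted from this patch.
    constexpr uint16_t encodeHwregSketch(uint16_t Id, uint16_t Offset,
                                         uint16_t Width) {
      return Id | (Offset << 6) | ((Width - 1) << 11);
    }

    // hwreg(HW_REG_TRAPSTS, 8, 1): a one-bit field at offset 8 of TRAPSTS.
    static_assert(encodeHwregSketch(3, 8, 1) == 0x203, "TRAP_STS.MEM_VIOL");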
+
 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
 // wavefront. If the value is uniform and just happens to be in a VGPR, this
 // will only do one iteration. In the worst case, this will loop 64 times.
@@ -3062,24 +3165,9 @@
   BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
     .addReg(Exec);
 
-  // To insert the loop we need to split the block. Move everything after this
-  // point to a new block, and insert a new empty block between the two.
-  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
-  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
-
-  MF->insert(MBBI, LoopBB);
-  MF->insert(MBBI, RemainderBB);
-
-  LoopBB->addSuccessor(LoopBB);
-  LoopBB->addSuccessor(RemainderBB);
-
-  // Move the rest of the block into a new block.
-  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
-  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
-
-  MBB.addSuccessor(LoopBB);
+  MachineBasicBlock *LoopBB;
+  MachineBasicBlock *RemainderBB;
+  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
 
   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
 
@@ -3631,6 +3719,14 @@
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::DS_GWS_INIT:
+  case AMDGPU::DS_GWS_SEMA_V:
+  case AMDGPU::DS_GWS_SEMA_BR:
+  case AMDGPU::DS_GWS_SEMA_P:
+  case AMDGPU::DS_GWS_BARRIER:
+    if (getSubtarget()->hasGWSAutoReplay())
+      return BB;
+    return emitGWSMemViolTestLoop(MI, BB);
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -1,14 +1,23 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s
 
 ; Minimum offset
 ; GCN-LABEL: {{^}}gws_barrier_offset0:
-; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
-; GCN-DAG: s_mov_b32 m0, -1{{$}}
-; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
-; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
+; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; NOLOOP-DAG: s_mov_b32 m0, -1{{$}}
+; NOLOOP: v_mov_b32_e32 v0, [[BAR_NUM]]
+; NOLOOP: ds_gws_barrier v0 offset:1 gds{{$}}
+
+; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]:
+; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; LOOP-NEXT: ds_gws_barrier v0 offset:1 gds
+; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
+; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
+; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
 define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
   ret void
@@ -16,10 +25,10 @@
 
 ; Maximum offset
 ; GCN-LABEL: {{^}}gws_barrier_offset63:
-; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
-; GCN-DAG: s_mov_b32 m0, -1{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
-; GCN: ds_gws_barrier v0 offset:64 gds{{$}}
+; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; NOLOOP-DAG: s_mov_b32 m0, -1{{$}}
+; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; NOLOOP: ds_gws_barrier v0 offset:64 gds{{$}}
 define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
   ret void
@@ -27,11 +36,11 @@
 
 ; FIXME: Should be able to shift directly into m0
 ; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
-; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
-; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
-; GCN: ds_gws_barrier v0 gds{{$}}
+; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; NOLOOP-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; NOLOOP: ds_gws_barrier v0 gds{{$}}
 define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
   ret void
@@ -39,11 +48,11 @@
 
 ; Variable offset in SGPR with constant add
 ; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
-; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
-; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
-; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
+; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; NOLOOP-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; NOLOOP: ds_gws_barrier v0 offset:1 gds{{$}}
 define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
   %offset = add i32 %offset.base, 1
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
@@ -51,12 +60,12 @@
 }
 
 ; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
-; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
-; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
-; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
-; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
-; GCN: ds_gws_barrier v0 gds{{$}}
+; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; NOLOOP: ds_gws_barrier v0 gds{{$}}
 define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
   %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
@@ -65,12 +74,12 @@
 
 ; Variable offset in VGPR with constant add
 ; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
-; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
-; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
-; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
-; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
-; GCN: ds_gws_barrier v0 offset:3 gds{{$}}
+; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; NOLOOP: ds_gws_barrier v0 offset:3 gds{{$}}
 define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
   %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
   %vgpr.offset = add i32 %vgpr.offset.base, 3
@@ -82,8 +91,8 @@
 
 ; Check if m0 initialization is shared
 ; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
-; GCN: s_mov_b32 m0, -1
-; GCN-NOT: s_mov_b32 m0
+; NOLOOP: s_mov_b32 m0, -1
+; NOLOOP-NOT: s_mov_b32 m0
 define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
   store i32 1, i32 addrspace(3)* @lds
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
@@ -93,9 +102,9 @@
 
 ; Make sure this increments lgkmcnt
 ; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
-; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
-; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
+; NOLOOP: ds_gws_barrier v0 offset:1 gds{{$}}
+; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+; NOLOOP-NEXT: s_setpc_b64
 define void @gws_barrier_lgkmcnt(i32 %val) {
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
   ret void
@@ -103,9 +112,8 @@
 
 ; Does not imply memory fence on its own
 ; GCN-LABEL: {{^}}gws_barrier_wait_before:
-; GCN: store_dword
-; CIPLUS-NOT: s_waitcnt
-; GCN: ds_gws_barrier v0 offset:8 gds
+; NOLOOP: s_waitcnt
+; NOLOOP-NOT: s_waitcnt
 define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
   store i32 0, i32 addrspace(1)* %ptr
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
@@ -113,9 +121,9 @@
 }
 
 ; GCN-LABEL: {{^}}gws_barrier_wait_after:
-; GCN: ds_gws_barrier v0 offset:8 gds
-; GCN-NEXT: s_waitcnt expcnt(0){{$}}
-; GCN-NEXT: load_dword
+; NOLOOP: ds_gws_barrier v0 offset:8 gds
+; NOLOOP-NEXT: s_waitcnt expcnt(0){{$}}
+; NOLOOP-NEXT: load_dword
 define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
   %load = load volatile i32, i32 addrspace(1)* %ptr
@@ -124,9 +132,9 @@
 
 ; Does not imply memory fence on its own
 ; GCN-LABEL: {{^}}gws_barrier_fence_before:
-; GCN: store_dword
-; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN: ds_gws_barrier v0 offset:8 gds
+; NOLOOP: store_dword
+; NOLOOP: s_waitcnt vmcnt(0) lgkmcnt(0)
+; NOLOOP: ds_gws_barrier v0 offset:8 gds
 define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
   store i32 0, i32 addrspace(1)* %ptr
   fence release
@@ -135,9 +143,10 @@
 }
 
 ; GCN-LABEL: {{^}}gws_barrier_fence_after:
-; GCN: ds_gws_barrier v0 offset:8 gds
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: load_dword
+; NOLOOP: ds_gws_barrier v0 offset:8 gds
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; NOLOOP-NEXT: load_dword
 define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
   fence release
@@ -147,9 +156,9 @@
 
 ; FIXME: Should a wait be inserted here, or is an explicit fence needed?
 ; GCN-LABEL: {{^}}gws_init_barrier:
-; GCN: s_mov_b32 m0, -1
-; GCN: ds_gws_init v0 offset:8 gds
-; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
+; NOLOOP: s_mov_b32 m0, -1
+; NOLOOP: ds_gws_init v0 offset:8 gds
+; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds
 define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
   call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
@@ -158,10 +167,11 @@
 
 ; FIXME: Why vmcnt, not expcnt?
 ; GCN-LABEL: {{^}}gws_init_fence_barrier:
-; GCN: s_mov_b32 m0, -1
-; GCN: ds_gws_init v0 offset:8 gds
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
+; NOLOOP: s_mov_b32 m0, -1
+; NOLOOP: ds_gws_init v0 offset:8 gds
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds
 define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
   fence release
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
@@ -1,14 +1,23 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
 
 ; Minimum offset
 ; GCN-LABEL: {{^}}gws_init_offset0:
 ; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
 ; GCN-DAG: s_mov_b32 m0, -1{{$}}
 ; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
-; GCN: ds_gws_init v0 offset:1 gds{{$}}
+; NOLOOP: ds_gws_init v0 offset:1 gds{{$}}
+
+; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]:
+; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; LOOP-NEXT: ds_gws_init v0 offset:1 gds
+; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
+; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
+; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
 define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
   ret void
@@ -16,10 +25,19 @@
 
 ; Maximum offset
 ; GCN-LABEL: {{^}}gws_init_offset63:
-; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
-; GCN-DAG: s_mov_b32 m0, -1{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
-; GCN: ds_gws_init v0 offset:64 gds{{$}}
+; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; NOLOOP-DAG: s_mov_b32 m0, -1{{$}}
+; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; NOLOOP: ds_gws_init v0 offset:64 gds{{$}}
+
+
+; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]:
+; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
+; LOOP-NEXT: ds_gws_init v0 offset:64 gds
+; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
+; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
+; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
 define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
   ret void
@@ -27,11 +45,11 @@
 
 ; FIXME: Should be able to shift directly into m0
 ; GCN-LABEL: {{^}}gws_init_sgpr_offset:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
-; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
-; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
-; GCN: ds_gws_init v0 gds{{$}}
+; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; NOLOOP-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; NOLOOP: ds_gws_init v0 gds{{$}}
 define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
   ret void
@@ -39,11 +57,11 @@
 
 ; Variable offset in SGPR with constant add
 ; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
-; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
-; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
-; GCN: ds_gws_init v0 offset:1 gds{{$}}
+; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; NOLOOP-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; NOLOOP: ds_gws_init v0 offset:1 gds{{$}}
 define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
   %offset = add i32 %offset.base, 1
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
@@ -51,12 +69,12 @@
 }
 
 ; GCN-LABEL: {{^}}gws_init_vgpr_offset:
-; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
-; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
-; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
-; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
-; GCN: ds_gws_init v0 gds{{$}}
+; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; NOLOOP: ds_gws_init v0 gds{{$}}
 define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
   %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
@@ -65,12 +83,12 @@
 
 ; Variable offset in VGPR with constant add
 ; GCN-LABEL: {{^}}gws_init_vgpr_offset_add:
-; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
-; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
-; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
-; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
-; GCN: ds_gws_init v0 offset:3 gds{{$}}
+; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; NOLOOP: ds_gws_init v0 offset:3 gds{{$}}
 define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
   %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
   %vgpr.offset = add i32 %vgpr.offset.base, 3
@@ -82,8 +100,8 @@
 
 ; Check if m0 initialization is shared.
 ; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset:
-; GCN: s_mov_b32 m0, -1
-; GCN-NOT: s_mov_b32 m0
+; NOLOOP: s_mov_b32 m0, -1
+; NOLOOP-NOT: s_mov_b32 m0
 define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
   store i32 1, i32 addrspace(3)* @lds
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
@@ -92,9 +110,9 @@
 }
 
 ; GCN-LABEL: {{^}}gws_init_lgkmcnt:
-; GCN: ds_gws_init v0 offset:1 gds{{$}}
-; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64
+; NOLOOP: ds_gws_init v0 offset:1 gds{{$}}
+; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+; NOLOOP-NEXT: s_setpc_b64
 define void @gws_init_lgkmcnt(i32 %val) {
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
   ret void
@@ -102,9 +120,8 @@
 
 ; Does not imply memory fence on its own
 ; GCN-LABEL: {{^}}gws_init_wait_before:
-; GCN: store_dword
-; CIPLUS-NOT: s_waitcnt
-; GCN: ds_gws_init v0 offset:8 gds
+; NOLOOP: s_waitcnt
+; NOLOOP-NOT: s_waitcnt
 define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
   store i32 0, i32 addrspace(1)* %ptr
   call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)