Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -116,7 +116,7 @@ static cl::opt EnableSIInsertWaitcntsPass( "enable-si-insert-waitcnts", cl::desc("Use new waitcnt insertion pass"), - cl::init(false)); + cl::init(true)); // Option to run late CFG structurizer static cl::opt LateCFGStructurize( Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -826,7 +826,8 @@ // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { @@ -1149,8 +1150,10 @@ // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. - if (TII->isDS(Inst) && (Inst.mayLoad() || Inst.mayStore())) { - if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { + uint64_t TSFlags = Inst.getDesc().TSFlags; + if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) { + if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) && + TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); } else { @@ -1183,7 +1186,7 @@ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() && - (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()))) { + (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); } } else if (TII->isSMRD(Inst)) { @@ -1715,6 +1718,7 @@ MRI = &MF.getRegInfo(); MLI = &getAnalysis(); IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); + const SIMachineFunctionInfo *MFI = MF.getInfo(); AMDGPUASI = ST->getAMDGPUAS(); HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); @@ -1859,5 +1863,19 @@ } } + if (!MFI->isEntryFunction()) { + // Wait for any outstanding memory operations that the input registers may + // depend on. We can't track them and it's better to to the wait after the + // costly call sequence. + + // TODO: Could insert earlier and schedule more liberally with operations + // that only use caller preserved registers. + MachineBasicBlock &EntryBB = MF.front(); + BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + Modified = true; + } + return Modified; } Index: test/CodeGen/AMDGPU/basic-branch.ll =================================================================== --- test/CodeGen/AMDGPU/basic-branch.ll +++ test/CodeGen/AMDGPU/basic-branch.ll @@ -34,8 +34,6 @@ ; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]] ; GCN: buffer_store_dword -; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; TODO: This waitcnt can be eliminated ; GCN: {{^}}[[END]]: ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/branch-condition-and.ll =================================================================== --- test/CodeGen/AMDGPU/branch-condition-and.ll +++ test/CodeGen/AMDGPU/branch-condition-and.ll @@ -19,9 +19,8 @@ ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4 ; GCN: ds_write_b32 -; GCN: s_waitcnt -; GCN-NEXT: [[BB5]] +; GCN: [[BB5]] ; GCN: s_or_b64 exec, exec ; GCN-NEXT: s_endpgm ; GCN-NEXT: .Lfunc_end Index: test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- test/CodeGen/AMDGPU/branch-relaxation.ll +++ test/CodeGen/AMDGPU/branch-relaxation.ll @@ -223,7 +223,6 @@ ; GCN-NEXT: [[BB2]]: ; %bb2 ; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17 ; GCN: buffer_store_dword [[BB2_K]] -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2 ; GCN-NEXT: s_getpc_b64 vcc @@ -393,7 +392,6 @@ ; GCN-NEXT: ; BB#2: ; %if_uniform ; GCN: buffer_store_dword -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: [[ENDIF]]: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] Index: test/CodeGen/AMDGPU/control-flow-fastregalloc.ll =================================================================== --- test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -37,22 +37,21 @@ ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} -; GCN: s_waitcnt vmcnt(0) expcnt(0) ; GCN: mask branch [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: {{^}}BB{{[0-9]+}}_1: ; %if ; GCN: s_mov_b32 m0, -1 ; GCN: ds_read_b32 [[LOAD1:v[0-9]+]] +; GCN: s_waitcnt lgkmcnt(0) ; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload -; GCN: s_waitcnt vmcnt(0) ; Spill val register ; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]] ; GCN: buffer_store_dword [[VAL]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: s_waitcnt vmcnt(0) ; VMEM: [[ENDIF]]: ; Reload and restore exec mask +; VGPR: s_waitcnt lgkmcnt(0) ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -119,7 +118,6 @@ ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} -; GCN: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: ; mask branch [[END:BB[0-9]+_[0-9]+]] ; GCN-NEXT: s_cbranch_execz [[END]] @@ -130,7 +128,6 @@ ; GCN: v_cmp_ne_u32_e32 vcc, ; GCN: s_and_b64 vcc, exec, vcc ; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_cbranch_vccnz [[LOOP]] @@ -197,7 +194,6 @@ ; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, [[CMP0]] -; GCN: s_waitcnt vmcnt(0) expcnt(0) ; FIXME: It makes no sense to put this skip here ; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] @@ -235,7 +231,6 @@ ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}} -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN-NEXT: s_cbranch_execz [[ENDIF]] @@ -245,14 +240,12 @@ ; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill -; GCN: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: [[ELSE]]: ; %else ; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_branch [[FLOW]] ; GCN: [[ENDIF]]: Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -120,8 +120,7 @@ ; FIXME: The waitcnt for the argument load can go after the loop ; IDXMODE: s_set_gpr_idx_on 0, src0 ; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec -; GCN: s_waitcnt lgkmcnt(0) - +; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}} ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0 @@ -250,8 +249,6 @@ ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}} ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec -; GCN: s_waitcnt lgkmcnt(0) - ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]] @@ -290,7 +287,6 @@ ; IDXMODE: s_set_gpr_idx_on 0, dst ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec -; GCN: s_waitcnt lgkmcnt(0) ; The offset depends on the register that holds the first element of the vector. ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]] @@ -330,9 +326,9 @@ ; IDXMODE: s_set_gpr_idx_on 0, src0 ; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec -; GCN: s_waitcnt vmcnt(0) ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] @@ -411,6 +407,7 @@ ; IDXMODE: s_set_gpr_idx_on 0, dst ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] Index: test/CodeGen/AMDGPU/infinite-loop.ll =================================================================== --- test/CodeGen/AMDGPU/infinite-loop.ll +++ test/CodeGen/AMDGPU/infinite-loop.ll @@ -4,8 +4,8 @@ ; SI-LABEL: {{^}}infinite_loop: ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 ; SI: BB0_1: +; SI: s_waitcnt lgkmcnt(0) ; SI: buffer_store_dword [[REG]] -; SI: s_waitcnt vmcnt(0) expcnt(0) ; SI: s_branch BB0_1 define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) { entry: Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll @@ -58,7 +58,7 @@ ; ;CHECK-LABEL: {{^}}buffer_store_wait: ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) expcnt(0) +;CHECK: s_waitcnt expcnt(0) ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -58,7 +58,7 @@ ; ;CHECK-LABEL: {{^}}buffer_store_wait: ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) expcnt(0) +;CHECK: s_waitcnt expcnt(0) ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll @@ -5,7 +5,6 @@ ; FUNC-LABEL: {{^}}ds_swizzle: ; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100 -; CHECK: s_waitcnt lgkmcnt define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind { %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 store i32 %swizzle, i32 addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -130,7 +130,7 @@ ; ; GCN-LABEL: {{^}}image_store_wait: ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm -; GCN: s_waitcnt vmcnt(0) expcnt(0) +; GCN: s_waitcnt expcnt(0) ; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm ; GCN: s_waitcnt vmcnt(0) ; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll @@ -20,7 +20,7 @@ ; GCN: s_waitcnt lgkmcnt(0) ; encoding define amdgpu_kernel void @test_s_dcache_inv_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.inv() - call void @llvm.amdgcn.s.waitcnt(i32 0) + call void @llvm.amdgcn.s.waitcnt(i32 127) br label %end end: Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll @@ -20,7 +20,7 @@ ; GCN: s_waitcnt lgkmcnt(0) ; encoding define amdgpu_kernel void @test_s_dcache_inv_vol_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.inv.vol() - call void @llvm.amdgcn.s.waitcnt(i32 0) + call void @llvm.amdgcn.s.waitcnt(i32 127) br label %end end: Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll @@ -18,7 +18,7 @@ ; VI: s_waitcnt lgkmcnt(0) ; encoding define amdgpu_kernel void @test_s_dcache_wb_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.wb() - call void @llvm.amdgcn.s.waitcnt(i32 0) + call void @llvm.amdgcn.s.waitcnt(i32 127) br label %end end: Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll @@ -18,7 +18,7 @@ ; VI: s_waitcnt lgkmcnt(0) ; encoding define amdgpu_kernel void @test_s_dcache_wb_vol_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.wb.vol() - call void @llvm.amdgcn.s.waitcnt(i32 0) + call void @llvm.amdgcn.s.waitcnt(i32 127) br label %end end: Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -18,8 +18,8 @@ ; ; CHECK-LABEL: {{^}}test2: ; CHECK: image_load -; CHECK-NOT: s_waitcnt vmcnt(0){{$}} -; CHECK: s_waitcnt +; CHECK-NEXT: s_waitcnt +; CHECK: s_waitcnt vmcnt(0){{$}} ; CHECK-NEXT: image_store define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) { %t = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) Index: test/CodeGen/AMDGPU/multi-divergent-exit-region.ll =================================================================== --- test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -362,6 +362,7 @@ ; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock ; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ; return define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 { Index: test/CodeGen/AMDGPU/ret_jump.ll =================================================================== --- test/CodeGen/AMDGPU/ret_jump.ll +++ test/CodeGen/AMDGPU/ret_jump.ll @@ -65,7 +65,6 @@ ; GCN-NEXT: ; %unreachable.bb ; GCN: ds_write_b32 -; GCN: s_waitcnt ; GCN: ; divergent unreachable ; GCN: ; %ret.bb @@ -73,6 +72,7 @@ ; GCN: ; %UnifiedReturnBlock ; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_waitcnt ; GCN-NEXT: ; return ; GCN-NEXT: .Lfunc_end define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { Index: test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll =================================================================== --- test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -9,7 +9,6 @@ ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable ; GCN: ds_write_b32 ; GCN: ; divergent unreachable -; GCN: s_waitcnt ; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock ; GCN-NEXT: s_or_b64 exec, exec @@ -38,7 +37,6 @@ ; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable ; GCN: ds_write_b32 ; GCN: ; divergent unreachable -; GCN: s_waitcnt ; GCN: [[RETURN]]: ; GCN-NEXT: s_or_b64 exec, exec @@ -66,7 +64,6 @@ ; GCN: [[UNREACHABLE]]: ; GCN: ds_write_b32 -; GCN: s_waitcnt define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 { bb: %tmp63 = icmp eq i32 %arg0, 32 Index: test/CodeGen/AMDGPU/smrd-vccz-bug.ll =================================================================== --- test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -5,7 +5,7 @@ ; GCN-FUNC: {{^}}vccz_workaround: ; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0 ; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0{{$}} -; GCN: s_waitcnt lgkmcnt(0) +; VCCZ-BUG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VCCZ-BUG: s_mov_b64 vcc, vcc ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc ; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]] Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -18,13 +18,11 @@ ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]] ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill -; TOVMEM: s_waitcnt vmcnt(0) ; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 ; TOSMEM: s_add_u32 m0, s3, 0x100{{$}} ; TOSMEM-NOT: [[M0_COPY]] ; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill -; TOSMEM: s_waitcnt lgkmcnt(0) ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] Index: test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- test/CodeGen/AMDGPU/valu-i1.ll +++ test/CodeGen/AMDGPU/valu-i1.ll @@ -11,7 +11,6 @@ ; SI: v_cmp_lt_i32_e32 vcc, 0, ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]] -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]] ; SI-NEXT: s_cbranch_execz [[FLOW_BB]] @@ -72,7 +71,6 @@ ; SI-NEXT: BB{{[0-9]+_[0-9]+}}: ; SI: buffer_store_dword -; SI-NEXT: s_waitcnt ; SI-NEXT: {{^}}[[EXIT]]: ; SI: s_or_b64 exec, exec, [[BR_SREG]] @@ -101,7 +99,6 @@ ; SI-NEXT: BB{{[0-9]+_[0-9]+}}: ; SI: buffer_store_dword -; SI-NEXT: s_waitcnt ; SI-NEXT: {{^}}[[EXIT]]: ; SI: s_or_b64 exec, exec, [[BR_SREG]] @@ -132,7 +129,6 @@ ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit ; SI: ds_write_b32 -; SI: s_waitcnt ; SI-NEXT: {{^}}[[FLOW]]: ; SI-NEXT: s_or_saveexec_b64 @@ -140,8 +136,8 @@ ; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]] ; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then -; SI: buffer_store_dword -; SI-NEXT: s_waitcnt +; SI: s_waitcnt +; SI-NEXT: buffer_store_dword ; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock ; SI: s_or_b64 exec, exec Index: test/CodeGen/AMDGPU/waitcnt-permute.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/waitcnt-permute.mir @@ -0,0 +1,33 @@ +# RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s + +--- | + define float @waitcnt-permute(i32 %x, i32 %y) { + entry: + %0 = call i32 @llvm.amdgcn.ds.bpermute(i32 %x, i32 %y) + %1 = bitcast i32 %0 to float + %2 = fadd float 1.000000e+00, %1 + ret float %2 + } + + declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) + +... +--- +# CHECK-LABEL: name: waitcnt-permute{{$}} +# CHECK: DS_BPERMUTE_B32 +# CHECK-NEXT: S_WAITCNT 127 + +name: waitcnt-permute +liveins: + - { reg: '%vgpr0' } + - { reg: '%vgpr1' } + - { reg: '%sgpr30_sgpr31' } +body: | + bb.0: + liveins: %vgpr0, %vgpr1, %sgpr30_sgpr31 + + %vgpr0 = DS_BPERMUTE_B32 killed %vgpr0, killed %vgpr1, 0, implicit %exec + %vgpr0 = V_ADD_F32_e32 1065353216, killed %vgpr0, implicit %exec + S_SETPC_B64_return killed %sgpr30_sgpr31, implicit killed %vgpr0 + +...