diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -368,6 +368,10 @@
   void addVRegDefDeps(SUnit *SU, unsigned OperIdx);
   void addVRegUseDeps(SUnit *SU, unsigned OperIdx);
 
+  /// Returns true if MI is an instruction we are unable to reason about
+  /// (like a call or something with unmodeled side effects).
+  virtual bool isGlobalMemoryObject(MachineInstr *MI);
+
   /// Returns a mask for which lanes get read/written by the given (register)
   /// machine operand.
   LaneBitmask getLaneMaskForMO(const MachineOperand &MO) const;
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -534,9 +534,7 @@
   }
 }
 
-/// Returns true if MI is an instruction we are unable to reason about
-/// (like a call or something with unmodeled side effects).
-static inline bool isGlobalMemoryObject(MachineInstr *MI) {
+bool ScheduleDAGInstrs::isGlobalMemoryObject(MachineInstr *MI) {
   return MI->isCall() || MI->hasUnmodeledSideEffects() ||
          (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad());
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -188,23 +188,6 @@
   }
 };
 
-// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
-static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
-  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
-         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
-         SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
-
-  while (!SU.Preds.empty())
-    for (auto &P : SU.Preds)
-      SU.removePred(P);
-
-  while (!SU.Succs.empty())
-    for (auto &S : SU.Succs)
-      for (auto &SP : S.getSUnit()->Preds)
-        if (SP.getSUnit() == &SU)
-          S.getSUnit()->removePred(SP);
-}
-
 typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
 typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
 
@@ -375,7 +358,6 @@
       // Command line requested IGroupLP doesn't have SGBarr
       if (!SGBarr)
         continue;
-      resetEdges(*SGBarr, DAG);
       SG.link(*SGBarr, false);
     }
   }
@@ -1026,7 +1008,6 @@
       initSchedGroupBarrierPipelineStage(R);
       foundSB = true;
     } else if (Opc == AMDGPU::IGLP_OPT) {
-      resetEdges(*R, DAG);
       if (!foundSB && !foundIGLP)
         initIGLPOpt(*R);
       foundIGLP = true;
@@ -1046,7 +1027,6 @@
   assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
   // Remove all existing edges from the SCHED_BARRIER that were added due to the
   // instruction having side effects.
-  resetEdges(SchedBarrier, DAG);
   auto InvertedMask =
       invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
   SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
@@ -1097,7 +1077,6 @@
     std::vector<SUnit>::reverse_iterator RIter) {
   // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
   // to the instruction having side effects.
-  resetEdges(*RIter, DAG);
   MachineInstr &SGB = *RIter->getInstr();
   assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
   int32_t SGMask = SGB.getOperand(0).getImm();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -232,6 +232,8 @@
   std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
 
+  bool isGlobalMemoryObject(MachineInstr *MI) override;
+
 public:
   GCNScheduleDAGMILive(MachineSchedContext *C,
                        std::unique_ptr<MachineSchedStrategy> S);
@@ -413,6 +415,8 @@
   bool HasIGLPInstrs = false;
 
+  bool isGlobalMemoryObject(MachineInstr *MI) override;
+
 public:
   void schedule() override;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -27,6 +27,7 @@
 #include "AMDGPUIGroupLP.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
 
 #define DEBUG_TYPE "machine-scheduler"
 
@@ -1524,6 +1525,23 @@
   }
 }
 
+static bool isIGLPInstr(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::IGLP_OPT:
+  case AMDGPU::SCHED_BARRIER:
+  case AMDGPU::SCHED_GROUP_BARRIER:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool GCNScheduleDAGMILive::isGlobalMemoryObject(MachineInstr *MI) {
+  if (isIGLPInstr(MI))
+    return false;
+  return ScheduleDAGInstrs::isGlobalMemoryObject(MI);
+}
+
 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
   return std::any_of(
       DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
@@ -1537,6 +1555,12 @@
     bool RemoveKillFlags)
     : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
 
+bool GCNPostScheduleDAGMILive::isGlobalMemoryObject(MachineInstr *MI) {
+  if (isIGLPInstr(MI))
+    return false;
+  return ScheduleDAGInstrs::isGlobalMemoryObject(MI);
+}
+
 void GCNPostScheduleDAGMILive::schedule() {
   HasIGLPInstrs = hasIGLPInstrs(this);
   if (HasIGLPInstrs) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -147,6 +147,44 @@
   ret void
 }
 
+; Ordering enforced by inline asm with sideeffects should be preserved.
+
+define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-LABEL: test_iglp_opt_asm_sideeffect:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    ; iglp_opt mask(0x00000000)
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    ds_read_b32 v1, v1
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ds_write_b32 v0, v1
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_read_b32 v0, v2 offset:256
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ds_write_b32 v1, v0 offset:256
+; GCN-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.0.addr = getelementptr float, ptr addrspace(3) %in, i32 %idx
+  %load.0 = load float, ptr addrspace(3) %load.0.addr
+  %store.0.addr = getelementptr float, ptr addrspace(3) %out, i32 %idx
+  store float %load.0, ptr addrspace(3) %store.0.addr
+  call void asm sideeffect "", ""() #1
+  call void @llvm.amdgcn.iglp.opt(i32 0) #1
+  %load.1.addr = getelementptr float, ptr addrspace(3) %in, i32 64
+  %load.1 = load float, ptr addrspace(3) %load.1.addr
+  %store.1.addr = getelementptr float, ptr addrspace(3) %out, i32 64
+  store float %load.1, ptr addrspace(3) %store.1.addr
+  ret void
+}
+
+
 declare void @llvm.amdgcn.iglp.opt(i32) #1
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir
--- a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir
@@ -9,9 +9,11 @@
   define amdgpu_kernel void @sched_barrier_mask_4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
   define amdgpu_kernel void @sched_barrier_mask_8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
   define amdgpu_kernel void @sched_barrier_mask_16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_16_asm_sideeffect(ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in) { ret void }
   define amdgpu_kernel void @sched_barrier_mask_32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
   define amdgpu_kernel void @sched_barrier_mask_64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
   define amdgpu_kernel void @sched_barrier_mask_128(ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_128_asm_sideeffect(ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in) { ret void }
   define amdgpu_kernel void @sched_barrier_mask_256(ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in) { ret void }
   define amdgpu_kernel void @sched_barrier_mask_512(ptr addrspace(3) noalias %out, ptr addrspace(3) noalias %in) { ret void }
   define amdgpu_kernel void @sched_barrier_masks_8_12(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
@@ -279,6 +281,40 @@
     S_ENDPGM 0
 ...
 
+# ASM with side-effects should act as a barrier for VMEM instructions.
+
+---
+name: sched_barrier_mask_16_asm_sideeffect
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_16_asm_sideeffect
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: SCHED_BARRIER 16
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_NOP 0
+    INLINEASM &"", 1
+    SCHED_BARRIER 16
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    S_ENDPGM 0
+...
+
 # MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
 
 ---
@@ -358,7 +394,7 @@
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
     ; CHECK-NEXT: SCHED_BARRIER 128
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
-    ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
     ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
     ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
     ; CHECK-NEXT: S_ENDPGM 0
@@ -375,6 +411,40 @@
     S_ENDPGM 0
 ...
 
+# ASM with side-effects should act as a barrier for DS instructions.
+
+---
+name: sched_barrier_mask_128_asm_sideeffect
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_128_asm_sideeffect
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
+    ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
+    ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: SCHED_BARRIER 128
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
+    ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec
+    DS_WRITE_B32 %3, %1, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    S_NOP 0
+    INLINEASM &"", 1
+    SCHED_BARRIER 128
+    %4:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
+    %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %4, %4, implicit $exec
+    DS_WRITE_B32 %5, %3, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
+    S_ENDPGM 0
+...
+
 # MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
 
 ---
@@ -391,7 +461,7 @@
     ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
     ; CHECK-NEXT: SCHED_BARRIER 256
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
-    ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
     ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
     ; CHECK-NEXT: S_ENDPGM 0
     %0:sreg_64 = IMPLICIT_DEF
@@ -422,7 +492,7 @@
     ; CHECK-NEXT: SCHED_BARRIER 512
     ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
-    ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
     ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
     ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
     ; CHECK-NEXT: S_ENDPGM 0