Index: llvm/lib/Target/AMDGPU/SIPostRABundler.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -48,10 +48,15 @@ SmallSet Defs; + bool isBundleCandidate(const MachineInstr &MI) const; bool isDependentLoad(const MachineInstr &MI) const; - + bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const; }; +constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF | + SIInstrFlags::SMRD | SIInstrFlags::DS | + SIInstrFlags::FLAT | SIInstrFlags::MIMG; + } // End anonymous namespace. INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false) @@ -80,55 +85,75 @@ return false; } +bool SIPostRABundler::isBundleCandidate(const MachineInstr &MI) const { + const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags; + return IMemFlags != 0 && MI.mayLoadOrStore() && !MI.isBundled(); +} + +bool SIPostRABundler::canBundle(const MachineInstr &MI, + const MachineInstr &NextMI) const { + const uint64_t IMemFlags = MI.getDesc().TSFlags & MemFlags; + + // TODO: It may be better to move meta instructions like dbg_value after + // the bundle. We're relying on the memory legalizer to unbundle these. + return (IMemFlags != 0 && MI.mayLoadOrStore() && !NextMI.isBundled() && + NextMI.mayLoad() == MI.mayLoad() && NextMI.mayStore() == MI.mayStore() && + ((NextMI.getDesc().TSFlags & MemFlags) == IMemFlags) && + !isDependentLoad(NextMI)); +} + bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; TRI = MF.getSubtarget().getRegisterInfo(); bool Changed = false; - const uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF | - SIInstrFlags::SMRD | SIInstrFlags::DS | - SIInstrFlags::FLAT | SIInstrFlags::MIMG; - for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::instr_iterator Next; MachineBasicBlock::instr_iterator B = MBB.instr_begin(); MachineBasicBlock::instr_iterator E = MBB.instr_end(); + for (auto I = B; I != E; I = Next) { Next = std::next(I); + if (!isBundleCandidate(*I)) + continue; - const uint64_t IMemFlags = I->getDesc().TSFlags & MemFlags; - - if (IMemFlags == 0 || I->isBundled() || !I->mayLoadOrStore() || - B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() || - ((B->getDesc().TSFlags & MemFlags) != IMemFlags) || - isDependentLoad(*I)) { - - if (B != I) { - if (std::next(B) != I) { - finalizeBundle(MBB, B, I); - Changed = true; - } - Next = I; + assert(Defs.empty()); + + if (I->getNumExplicitDefs() != 0) + Defs.insert(I->defs().begin()->getReg()); + + MachineBasicBlock::instr_iterator BundleStart = I; + MachineBasicBlock::instr_iterator BundleEnd = I; + unsigned ClauseLength = 1; + for (I = Next; I != E; I = Next) { + Next = std::next(I); + + assert(BundleEnd != I); + if (canBundle(*BundleEnd, *I)) { + BundleEnd = I; + if (I->getNumExplicitDefs() != 0) + Defs.insert(I->defs().begin()->getReg()); + ++ClauseLength; + } else if (I->isMetaInstruction()) { + // Allow meta instructions in between bundle candidates, but do not + // start or end a bundle on one. + // + // TODO: It may be better to move meta instructions like dbg_value after + // the bundle. We're relying on the memory legalizer to unbundle these. + } else { + break; } - - B = Next; - Defs.clear(); - continue; } - if (I->getNumExplicitDefs() == 0) - continue; - - Defs.insert(I->defs().begin()->getReg()); - } + Next = std::next(BundleEnd); + if (ClauseLength > 1) { + Changed = true; + finalizeBundle(MBB, BundleStart, Next); + } - if (B != E && std::next(B) != E) { - finalizeBundle(MBB, B, E); - Changed = true; + Defs.clear(); } - - Defs.clear(); } return Changed; Index: llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+xnack -amdgpu-max-memory-clause=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; Test the behavior of the post-RA soft clause bundler in the presence +; of debug info. The debug info should not interfere with the +; bundling, which could result in an observable codegen change. + +define amdgpu_kernel void @dbg_clause(float addrspace(1)* %out, float addrspace(1)* %aptr) !dbg !4 { +; GCN-LABEL: dbg_clause: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_dword v1, v0, s[2:3] +; GCN-NEXT: ;DEBUG_VALUE: foo:a <- $vgpr1 +; GCN-NEXT: global_load_dword v2, v0, s[2:3] offset:32 +; GCN-NEXT: ;DEBUG_VALUE: foo:b <- $vgpr2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, v1, v2 +; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr float, float addrspace(1)* %gep0, i32 8 + %a = load float, float addrspace(1)* %gep0, align 4 + call void @llvm.dbg.value(metadata float %a, metadata !8, metadata !DIExpression()), !dbg !9 + %b = load float, float addrspace(1)* %gep1, align 4 + call void @llvm.dbg.value(metadata float %b, metadata !10, metadata !DIExpression()), !dbg !11 + %fadd = fadd float %a, %b + store float %fadd, float addrspace(1)* %out.gep, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + +attributes #0 = { nounwind readnone speculatable willreturn } +attributes #1 = { nofree nosync nounwind readnone speculatable willreturn } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug) +!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !5, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(types: !6) +!6 = !{null, !7} +!7 = !DIBasicType(name: "float", size: 32, align: 32) +!8 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 1) +!9 = !DILocation(line: 1, column: 42, scope: !4) +!10 = !DILocalVariable(name: "b", arg: 2, scope: !4, file: !1, line: 2) +!11 = !DILocation(line: 2, column: 42, scope: !4) Index: llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -56,12 +56,12 @@ ; GCN: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { - ; GCN: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + ; GCN: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) ; GCN: } ; GCN: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { - ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) ; GCN: } ; GCN: S_NOP 0 ; GCN: $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0, 0 @@ -112,3 +112,111 @@ $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec ... + +# Middle dbg_value should be bundled +--- +name: bundle_dbg_value_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + ; GCN-LABEL: name: bundle_dbg_value_0 + ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: DBG_VALUE internal $vgpr0, 0, 0 + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + DBG_VALUE $vgpr0, 0, 0 + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + +... + +# Middle dbg_value should be bundled +--- +name: bundle_dbg_value_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1 + ; GCN-LABEL: name: bundle_dbg_value_1 + ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1 + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: DBG_VALUE internal $vgpr0, 0, 0 + ; GCN: DBG_VALUE $vgpr1, 0, 0 + ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: DBG_VALUE $vgpr2, 0, 0 + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + DBG_VALUE $vgpr0, 0, 0 + DBG_VALUE $vgpr1, 0, 0 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + DBG_VALUE $vgpr2, 0, 0 +... + +# Starting and ending dbg_values should not be in the bundle +--- +name: bundle_dbg_value_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1 + ; GCN-LABEL: name: bundle_dbg_value_2 + ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1 + ; GCN: DBG_VALUE $vgpr1, 0, 0 + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: DBG_VALUE internal $vgpr0, 0, 0 + ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: DBG_VALUE $vgpr2, 0, 0 + DBG_VALUE $vgpr1, 0, 0 + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + DBG_VALUE $vgpr0, 0, 0 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + DBG_VALUE $vgpr2, 0, 0 +... + +--- +name: bundle_kill +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + ; GCN-LABEL: name: bundle_kill + ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: KILL $vgpr1 + ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + KILL $vgpr1 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + +... + +--- +name: bundle_kill_def_in_bundle +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + ; GCN-LABEL: name: bundle_kill_def_in_bundle + ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: KILL internal $vgpr0 + ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + KILL $vgpr0 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + +...