diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -295,6 +295,12 @@ return; } + if (MI->isMetaInstruction()) { + if (isVerbose()) + OutStreamer->emitRawComment(" meta instruction"); + return; + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -58,6 +58,8 @@ // Internal instructions, which are allowed in the middle of a hard clause, // except for s_waitcnt. HARDCLAUSE_INTERNAL, + // Pseudo instructions that do not result in any ISA like KILL. + HARDCLAUSE_IGNORE, // Instructions that are not allowed in a hard clause: SALU, export, branch, // message, GDS, s_waitcnt and anything else not mentioned above. HARDCLAUSE_ILLEGAL, @@ -75,7 +77,8 @@ MachineFunctionPass::getAnalysisUsage(AU); } - HardClauseType getHardClauseType(const MachineInstr &MI) { + HardClauseType getHardClauseType(const SIInstrInfo *SII, + const MachineInstr &MI) { // On current architectures we only get a benefit from clausing loads. if (MI.mayLoad()) { @@ -100,6 +103,8 @@ // It's safe to treat the rest as illegal. if (MI.getOpcode() == AMDGPU::S_NOP) return HARDCLAUSE_INTERNAL; + if (MI.isMetaInstruction()) + return HARDCLAUSE_IGNORE; return HARDCLAUSE_ILLEGAL; } @@ -119,18 +124,14 @@ }; bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) { - // Get the size of the clause excluding any internal instructions at the - // end. - unsigned Size = - std::distance(CI.First->getIterator(), CI.Last->getIterator()) + 1; - if (Size < 2) + if (CI.First == CI.Last) return false; - assert(Size <= 64 && "Hard clause is too long!"); + assert(CI.Length <= 64 && "Hard clause is too long!"); auto &MBB = *CI.First->getParent(); auto ClauseMI = BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE)) - .addImm(Size - 1); + .addImm(CI.Length - 1); finalizeBundle(MBB, ClauseMI->getIterator(), std::next(CI.Last->getIterator())); return true; @@ -151,7 +152,7 @@ for (auto &MBB : MF) { ClauseInfo CI; for (auto &MI : MBB) { - HardClauseType Type = getHardClauseType(MI); + HardClauseType Type = getHardClauseType(SII, MI); int64_t Dummy1; bool Dummy2; @@ -168,6 +169,7 @@ if (CI.Length == 64 || (CI.Length && Type != HARDCLAUSE_INTERNAL && + Type != HARDCLAUSE_IGNORE && (Type != CI.Type || // Note that we lie to shouldClusterMemOps about the size of the // cluster. When shouldClusterMemOps is called from the machine @@ -182,10 +184,12 @@ if (CI.Length) { // Extend the current clause. - ++CI.Length; - if (Type != HARDCLAUSE_INTERNAL) { - CI.Last = &MI; - CI.BaseOps = std::move(BaseOps); + if (Type != HARDCLAUSE_IGNORE) { + ++CI.Length; + if (Type != HARDCLAUSE_INTERNAL) { + CI.Last = &MI; + CI.BaseOps = std::move(BaseOps); + } } } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { // Start a new clause. diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -8634,10 +8634,10 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 15 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: s_clause 0x2 ; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1 -; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: ; meta instruction +; GFX10-SCRATCH-NEXT: ; meta instruction ; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 ; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir --- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir @@ -239,3 +239,22 @@ $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ... + +--- +name: kill +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr4 + ; CHECK-LABEL: name: kill + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr4 + ; CHECK: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 { + ; CHECK: S_CLAUSE 1 + ; CHECK: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; CHECK: KILL undef renamable $sgpr4 + ; CHECK: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 + ; CHECK: } + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + KILL undef renamable $sgpr4 + $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0 +...