diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -295,6 +295,12 @@
       return;
     }
 
+    if (MI->isMetaInstruction()) {
+      if (isVerbose())
+        OutStreamer->emitRawComment(" meta instruction");
+      return;
+    }
+
     MCInst TmpInst;
     MCInstLowering.lower(MI, TmpInst);
     EmitToStreamer(*OutStreamer, TmpInst);
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -58,6 +58,8 @@
   // Internal instructions, which are allowed in the middle of a hard clause,
   // except for s_waitcnt.
   HARDCLAUSE_INTERNAL,
+  // Meta instructions that do not result in any ISA, like KILL.
+  HARDCLAUSE_IGNORE,
   // Instructions that are not allowed in a hard clause: SALU, export, branch,
   // message, GDS, s_waitcnt and anything else not mentioned above.
   HARDCLAUSE_ILLEGAL,
@@ -75,7 +77,8 @@
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
-  HardClauseType getHardClauseType(const MachineInstr &MI) {
+  HardClauseType getHardClauseType(const SIInstrInfo *SII,
+                                   const MachineInstr &MI) {
 
     // On current architectures we only get a benefit from clausing loads.
     if (MI.mayLoad()) {
@@ -100,6 +103,8 @@
     // It's safe to treat the rest as illegal.
     if (MI.getOpcode() == AMDGPU::S_NOP)
       return HARDCLAUSE_INTERNAL;
+    if (MI.isMetaInstruction())
+      return HARDCLAUSE_IGNORE;
     return HARDCLAUSE_ILLEGAL;
   }
 
@@ -119,18 +124,14 @@
   };
 
   bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
-    // Get the size of the clause excluding any internal instructions at the
-    // end.
-    unsigned Size =
-        std::distance(CI.First->getIterator(), CI.Last->getIterator()) + 1;
-    if (Size < 2)
+    if (CI.First == CI.Last)
       return false;
-    assert(Size <= 64 && "Hard clause is too long!");
+    assert(CI.Length <= 64 && "Hard clause is too long!");
 
     auto &MBB = *CI.First->getParent();
     auto ClauseMI =
         BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE))
-            .addImm(Size - 1);
+            .addImm(CI.Length - 1);
     finalizeBundle(MBB, ClauseMI->getIterator(),
                    std::next(CI.Last->getIterator()));
     return true;
@@ -151,7 +152,7 @@
     for (auto &MBB : MF) {
       ClauseInfo CI;
       for (auto &MI : MBB) {
-        HardClauseType Type = getHardClauseType(MI);
+        HardClauseType Type = getHardClauseType(SII, MI);
 
         int64_t Dummy1;
         bool Dummy2;
@@ -168,6 +169,7 @@
 
         if (CI.Length == 64 ||
             (CI.Length && Type != HARDCLAUSE_INTERNAL &&
+             Type != HARDCLAUSE_IGNORE &&
              (Type != CI.Type ||
               // Note that we lie to shouldClusterMemOps about the size of the
               // cluster. When shouldClusterMemOps is called from the machine
@@ -182,10 +184,12 @@
 
         if (CI.Length) {
           // Extend the current clause.
-          ++CI.Length;
-          if (Type != HARDCLAUSE_INTERNAL) {
-            CI.Last = &MI;
-            CI.BaseOps = std::move(BaseOps);
+          if (Type != HARDCLAUSE_IGNORE) {
+            ++CI.Length;
+            if (Type != HARDCLAUSE_INTERNAL) {
+              CI.Last = &MI;
+              CI.BaseOps = std::move(BaseOps);
+            }
           }
         } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
           // Start a new clause.
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -8634,10 +8634,10 @@
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 14
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 15
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_clause 0x2
 ; GFX10-SCRATCH-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    ; kill: killed $sgpr0_sgpr1
-; GFX10-SCRATCH-NEXT:    ; kill: killed $sgpr0_sgpr1
-; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    ; meta instruction
+; GFX10-SCRATCH-NEXT:    ; meta instruction
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x40
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
@@ -239,3 +239,22 @@
     $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource")
     $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
 ...
+
+---
+name: kill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr4
+    ; CHECK-LABEL: name: kill
+    ; CHECK: liveins: $sgpr0_sgpr1, $sgpr4
+    ; CHECK: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+    ; CHECK:   S_CLAUSE 1
+    ; CHECK:   $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    ; CHECK:   KILL undef renamable $sgpr4
+    ; CHECK:   $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+    ; CHECK: }
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    KILL undef renamable $sgpr4
+    $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+...