diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4040,7 +4040,21 @@
     SMLoc ErrorLoc = IDLoc;
     if (ErrorInfo != ~0ULL) {
       if (ErrorInfo >= Operands.size()) {
-        return Error(getLoc(), "too few operands for instruction");
+        // FIXME: Unlike other targets, we try to use the SMLoc of the "end"
+        // of the instruction here. Everyone else does what we did before
+        // https://reviews.llvm.org/D92084 and points to IDLoc here.
+        //
+        // If this is left as getLoc() then we point to the beginning of the
+        // next (potential) statement, as we now consume the EndOfStatement
+        // token before returning from ParseInstruction.
+        //
+        // I don't see a clean way to recover the last position of the previous
+        // statement. `Operands.back()->getEndLoc() - 1` would also work below,
+        // but we don't properly record the end position for tokens currently
+        // and this seemed equivalent.
+        assert(getLexer().isAtStartOfStatement());
+        return Error(SMLoc::getFromPointer(getToken().getString().data() - 1),
+                     "too few operands for instruction");
       }
       ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc();
       if (ErrorLoc == SMLoc())
@@ -5020,9 +5034,11 @@
       while (!getLexer().is(AsmToken::EndOfStatement)) {
         Parser.Lex();
       }
+      Parser.Lex();
       return true;
     }
   }
+  Parser.Lex();
 
   return false;
 }
diff --git a/llvm/test/MC/AMDGPU/round-trip.s b/llvm/test/MC/AMDGPU/round-trip.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/round-trip.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -preserve-comments -triple amdgcn-amd-amdhsa %s >%t-1.s
+# RUN: llvm-mc -preserve-comments -triple amdgcn-amd-amdhsa %t-1.s >%t-2.s
+# RUN: diff %t-1.s %t-2.s
+
+# Test that AMDGPU assembly round-trips when run through MC; the first
+# transition from hand-written to "canonical" output may introduce some small
+# differences, so we don't include the initial input in the comparison.
+
+.text
+
+# The AMDGPU asm parser didn't consume the end of statement
+# consistently, which led to extra empty lines in the output.
+s_nop 0