Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2070,7 +2070,10 @@ Desc.getNumImplicitUses(); const unsigned NumImplicitOps = IsDst ? 2 : 1; - if (MI.getNumOperands() != StaticNumOps + NumImplicitOps) { + // Allow additional implicit operands. This allows a fixup done by the post + // RA scheduler where the main implicit operand is killed and implicit-defs + // are added for sub-registers that remain live after this instruction. + if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { ErrInfo = "missing implicit register operands"; return false; } Index: test/CodeGen/AMDGPU/movrels-bug.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/movrels-bug.ll @@ -0,0 +1,50 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}main: +; +; This used to trigger a malformed instruction because of an interplay between +; the post-RA register scheduler's fixup of physical register use/kill flags +; and the insistence that V_MOVRELS has a fixed number of implicit operands.
+; +; GCN: v_movrels +define amdgpu_vs void @main(i32 %arg) { +main_body: + %tmp = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> undef, i32 0, i32 %arg) + %tmp1 = extractelement <4 x float> %tmp, i32 1 + %tmp2 = fmul float undef, %tmp1 + %tmp3 = fadd float %tmp2, undef + %tmp4 = fadd float undef, %tmp3 + %tmp5 = fadd float undef, undef + %tmp6 = fmul float undef, undef + %tmp7 = fadd float %tmp6, undef + %tmp8 = fadd float undef, %tmp4 + %tmp9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 64) + %tmp10 = bitcast float %tmp9 to i32 + %tmp11 = shl i32 %tmp10, 1 + %tmp12 = or i32 %tmp11, 1 + %tmp13 = insertelement <6 x float> undef, float 0.000000e+00, i32 %tmp12 + %tmp14 = extractelement <6 x float> %tmp13, i32 undef + %tmp15 = fmul float %tmp14, undef + %tmp16 = fadd float undef, %tmp15 + %tmp17 = fsub float %tmp16, undef + %tmp18 = fmul float %tmp17, %tmp17 + %tmp19 = fadd float %tmp18, undef + %tmp20 = fcmp olt float %tmp19, 0x3E312E0BE0000000 + %. = select i1 %tmp20, float 0.000000e+00, float 1.000000e+00 + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %., float undef, float 0.000000e+00, float 1.000000e+00) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float undef, float %tmp5, float %tmp7, float %tmp8) + ret void +} + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind }