Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2070,7 +2070,10 @@ Desc.getNumImplicitUses(); const unsigned NumImplicitOps = IsDst ? 2 : 1; - if (MI.getNumOperands() != StaticNumOps + NumImplicitOps) { + // Allow additional implicit operands. This allows a fixup done by the post + // RA scheduler where the main implicit operand is killed and implicit-defs + // are added for sub-registers that remain live after this instruction. + if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { ErrInfo = "missing implicit register operands"; return false; } Index: test/CodeGen/AMDGPU/movrels-bug.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/movrels-bug.ll @@ -0,0 +1,50 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}main: +; +; This used to trigger a malformed instruction because of an interplay between +; the post-RA register scheduler's fixup of physical register use/kill flags +; and the insistence that V_MOVRELS has a fixed number of implicit operands.
+; +; GCN: v_movrels +define amdgpu_vs void @main(i32 %arg) { +main_body: + %tmp = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> undef, i32 0, i32 %arg) + %tmp1 = extractelement <4 x float> %tmp, i32 1 + %tmp2 = fmul float undef, %tmp1 + %tmp3 = fadd float %tmp2, undef + %tmp4 = fadd float undef, %tmp3 + %tmp5 = fadd float undef, undef + %tmp6 = fmul float undef, undef + %tmp7 = fadd float %tmp6, undef + %tmp8 = fadd float undef, %tmp4 + %tmp9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 64) + %tmp10 = bitcast float %tmp9 to i32 + %tmp11 = shl i32 %tmp10, 1 + %tmp12 = or i32 %tmp11, 1 + %tmp13 = insertelement <6 x float> undef, float 0.000000e+00, i32 %tmp12 + %tmp14 = extractelement <6 x float> %tmp13, i32 undef + %tmp15 = fmul float %tmp14, undef + %tmp16 = fadd float undef, %tmp15 + %tmp17 = fsub float %tmp16, undef + %tmp18 = fmul float %tmp17, %tmp17 + %tmp19 = fadd float %tmp18, undef + %tmp20 = fcmp olt float %tmp19, 0x3E312E0BE0000000 + %. = select i1 %tmp20, float 0.000000e+00, float 1.000000e+00 + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %., float undef, float 0.000000e+00, float 1.000000e+00) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float undef, float %tmp5, float %tmp7, float %tmp8) + ret void +} + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind }