Index: llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4458,6 +4458,11 @@ ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } + // All DPP instructions with at least one source operand have a fake "old" + // source at the beginning that's tied to the dst operand. Handle it here. + if (Desc.getNumOperands() >= 2) + Inst.addOperand(Inst.getOperand(0)); + for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments @@ -4480,16 +4485,6 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); - - // special case v_mac_{f16, f32}: - // it has src2 register operand that is tied to dst operand - if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp || - Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) { - auto it = Inst.begin(); - std::advance( - it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); - Inst.insert(it, Inst.getOperand(0)); // src2 = dst - } } //===----------------------------------------------------------------------===// Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td @@ -1184,8 +1184,9 @@ ); } -class getInsDPP { +class getInsDPP { dag ret = !if (!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) @@ -1194,26 +1195,29 @@ !if (!eq(NumSrcArgs, 1), !if (!eq(HasModifiers, 1), // VOP1_DPP with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + (ins DstRC:$old, Src0Mod:$src0_modifiers, + Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) /* else */, // VOP1_DPP without modifiers - (ins Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, - bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) + (ins DstRC:$old, Src0RC:$src0, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) /* endif */) /* NumSrcArgs == 2 */, !if (!eq(HasModifiers, 1), // VOP2_DPP with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + (ins DstRC:$old, + Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) /* else */, // VOP2_DPP without modifiers - (ins Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl, - row_mask:$row_mask, bank_mask:$bank_mask, - bound_ctrl:$bound_ctrl) + (ins DstRC:$old, + Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl, + row_mask:$row_mask, bank_mask:$bank_mask, + bound_ctrl:$bound_ctrl) /* endif */))); } @@ -1548,7 +1552,7 @@ getOpSelMod.ret, getOpSelMod.ret, getOpSelMod.ret>.ret; - field dag InsDPP = getInsDPP.ret; field dag InsSDWA = getInsSDWA vdst; let Inst{8-0} = 0xfa; // dpp @@ -659,11 +658,11 @@ def : Pat < (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, imm:$bound_ctrl)), - (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), - (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) + (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl)) >; - def : Pat< (i32 (anyext i16:$src)), (COPY $src) Index: llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td +++ llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td @@ -209,9 +209,9 @@ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64, 3, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; - let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + let InsDPP = (ins DstRCDPP:$old, + Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, - VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); @@ -282,7 +282,8 @@ dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, + let InsDPP = (ins DstRCDPP:$old, + Src0Mod:$src0_modifiers, Src0DPP:$src0, Src1Mod:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); @@ -665,8 +666,6 @@ let Uses = ps.Uses; let SchedRW = ps.SchedRW; let hasSideEffects = ps.hasSideEffects; - let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; bits<8> vdst; bits<8> src1; Index: llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td @@ -510,6 +510,8 @@ let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst); let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); + let Constraints = !if(P.NumSrcArgs, "$old = $vdst", ""); + let DisableEncoding = !if(P.NumSrcArgs, "$old", ""); let DecoderNamespace = "DPP"; } Index: llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -504,12 +504,12 @@ body: | bb.0: %vgpr0 = V_MOV_B32_e32 0, implicit %exec - %vgpr1 = V_MOV_B32_dpp %vgpr0, 0, 15, 15, 0, implicit %exec + %vgpr1 = V_MOV_B32_dpp %vgpr1, %vgpr0, 0, 15, 15, 0, implicit %exec S_BRANCH %bb.1 bb.1: implicit %exec, implicit %vcc = V_CMPX_EQ_I32_e32 %vgpr0, %vgpr1, implicit %exec - %vgpr3 = V_MOV_B32_dpp %vgpr0, 0, 15, 15, 0, implicit %exec + %vgpr3 = V_MOV_B32_dpp %vgpr3, %vgpr0, 0, 15, 15, 0, implicit %exec S_ENDPGM ... --- Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll @@ -5,8 +5,10 @@ ; VI-LABEL: {{^}}dpp_test: ; VI: v_mov_b32_e32 v0, s{{[0-9]+}} +; VI-NOOPT: v_mov_b32_e32 v1, s{{[0-9]+}} ; VI: s_nop 1 -; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] +; VI-OPT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] +; VI-NOOPT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11] define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) { %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0 store i32 %tmp0, i32 addrspace(1)* %out @@ -14,11 +16,14 @@ } ; VI-LABEL: {{^}}dpp_wait_states: +; VI-NOOPT: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s{{[0-9]+}} ; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}} ; VI: s_nop 1 -; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +; VI-OPT: v_mov_b32_dpp [[VGPR0]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +; VI-NOOPT: v_mov_b32_dpp [[VGPR1]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; VI: s_nop 1 -; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +; VI-OPT: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 +; VI-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) { %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0 %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0