Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1451,6 +1451,23 @@
   return LoopBB;
 }
 
+static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) {
+  switch (VecRC->getSize()) {
+  case 4:
+    return AMDGPU::V_MOVRELD_B32_V1;
+  case 8:
+    return AMDGPU::V_MOVRELD_B32_V2;
+  case 16:
+    return AMDGPU::V_MOVRELD_B32_V4;
+  case 32:
+    return AMDGPU::V_MOVRELD_B32_V8;
+  case 64:
+    return AMDGPU::V_MOVRELD_B32_V16;
+  default:
+    llvm_unreachable("unsupported size for MOVRELD pseudos");
+  }
+}
+
 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                           MachineBasicBlock &MBB,
                                           const SISubtarget &ST) {
@@ -1504,20 +1521,13 @@
 
     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
   } else {
-    const MCInstrDesc &MovRelDesc = TII->get(AMDGPU::V_MOVRELD_B32_e32);
-
-    MachineInstr *MovRel =
-        BuildMI(MBB, I, DL, MovRelDesc)
-        .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
-        .addOperand(*Val)
-        .addReg(Dst, RegState::ImplicitDefine)
-        .addReg(SrcVec->getReg(), RegState::Implicit);
+    const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
 
-    const int ImpDefIdx = MovRelDesc.getNumOperands() +
-                          MovRelDesc.getNumImplicitUses();
-    const int ImpUseIdx = ImpDefIdx + 1;
-
-    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
+    BuildMI(MBB, I, DL, MovRelDesc)
+        .addReg(Dst, RegState::Define)
+        .addReg(SrcVec->getReg())
+        .addOperand(*Val)
+        .addImm(SubReg - AMDGPU::sub0);
   }
 
   MI.eraseFromParent();
@@ -1555,20 +1565,13 @@
         .addReg(PhiReg, RegState::Implicit)
         .addReg(AMDGPU::M0, RegState::Implicit);
   } else {
-    const MCInstrDesc &MovRelDesc = TII->get(AMDGPU::V_MOVRELD_B32_e32);
-    // vdst is not actually read and just provides the base register index.
-    MachineInstr *MovRel =
-        BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
-        .addReg(PhiReg, RegState::Undef, SubReg) // vdst
-        .addOperand(*Val)
-        .addReg(Dst, RegState::ImplicitDefine)
-        .addReg(PhiReg, RegState::Implicit);
-
-    const int ImpDefIdx = MovRelDesc.getNumOperands() +
-                          MovRelDesc.getNumImplicitUses();
-    const int ImpUseIdx = ImpDefIdx + 1;
+    const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
 
-    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
+    BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
+        .addReg(Dst, RegState::Define)
+        .addReg(PhiReg)
+        .addOperand(*Val)
+        .addImm(SubReg - AMDGPU::sub0);
   }
 
   MI.eraseFromParent();
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -909,6 +909,32 @@
     MI.eraseFromParent();
     break;
   }
+  case AMDGPU::V_MOVRELD_B32_V1:
+  case AMDGPU::V_MOVRELD_B32_V2:
+  case AMDGPU::V_MOVRELD_B32_V4:
+  case AMDGPU::V_MOVRELD_B32_V8:
+  case AMDGPU::V_MOVRELD_B32_V16: {
+    const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
+    unsigned VecReg = MI.getOperand(0).getReg();
+    bool IsUndef = MI.getOperand(1).isUndef();
+    unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
+    assert(VecReg == MI.getOperand(1).getReg());
+
+    MachineInstr *MovRel =
+        BuildMI(MBB, MI, DL, MovRelDesc)
+            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
+            .addOperand(MI.getOperand(2))
+            .addReg(VecReg, RegState::ImplicitDefine)
+            .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+
+    const int ImpDefIdx =
+        MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
+    const int ImpUseIdx = ImpDefIdx + 1;
+    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
+
+    MI.eraseFromParent();
+    break;
+  }
   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
     MachineFunction &MF = *MBB.getParent();
     unsigned Reg = MI.getOperand(0).getReg();
Index: llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td
+++ llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td
@@ -538,6 +538,26 @@
   let VOP1 = 1;
 }
 
+// This is a pseudo variant of the v_movreld_b32 instruction in which the
+// vector operand appears only twice, once as def and once as use. Using this
+// pseudo avoids problems with the Two Address instructions pass.
+class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI <
+  (outs rc:$vdst),
+  (ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> {
+  let VOP1 = 1;
+
+  let Constraints = "$vsrc = $vdst";
+  let Uses = [M0, EXEC];
+
+  let SubtargetPredicate = HasMovrel;
+}
+
+def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>;
+def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>;
+def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
+def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
+def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
+
 let Predicates = [isVI] in {
 
 def : Pat <
Index: llvm/trunk/test/CodeGen/AMDGPU/movreld-bug.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/movreld-bug.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/movreld-bug.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}main:
+; GCN: v_movreld_b32_e32 v0,
+; GCN: v_mov_b32_e32 v0, v1
+; GCN: ; return
+define amdgpu_ps float @main(i32 inreg %arg) #0 {
+main_body:
+  %tmp24 = insertelement <2 x float> undef, float 0.000000e+00, i32 %arg
+  %tmp25 = extractelement <2 x float> %tmp24, i32 1
+  ret float %tmp25
+}
+
+attributes #0 = { "InitialPSInputAddr"="36983" }