Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -2015,17 +2015,18 @@ let hasNoSchedulingInfo = 1; } -let Uses = [EXEC], Defs = [EXEC, VCC, M0] in { +let Uses = [EXEC], Defs = [EXEC, VCC, M0], + UseNamedOperandTable = 1 in { class SI_INDIRECT_SRC : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins rc:$src, VSrc_32:$idx, i32imm:$off) + (outs VGPR_32:$vdst, SReg_64:$sdst), + (ins rc:$src, VSrc_32:$idx, i32imm:$offset) >; class SI_INDIRECT_DST : InstSI < - (outs rc:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val)> { - let Constraints = "$src = $dst"; + (outs rc:$vdst, SReg_64:$sdst), + (ins unknown:$src, VSrc_32:$idx, i32imm:$offset, VGPR_32:$val)> { + let Constraints = "$src = $vdst"; } // TODO: We can support indirect SGPR access. Index: lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- lib/Target/AMDGPU/SILowerControlFlow.cpp +++ lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -52,6 +52,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -88,9 +89,15 @@ void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); + void splitBlockLiveIns(const MachineBasicBlock &MBB, + const MachineInstr &MI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &RemainderBB, + unsigned SaveReg, + unsigned IdxReg); + void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL, - MachineInstr *MovRel, - unsigned SaveReg, unsigned IdxReg, int Offset); + MachineInstr *MovRel, unsigned IdxReg, int Offset); void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset); @@ -373,10 +380,41 @@ MI.eraseFromParent(); } +// All currently live registers must remain so in the remainder block. +void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB, + const MachineInstr &MI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &RemainderBB, + unsigned SaveReg, + unsigned IdxReg) { + LivePhysRegs RemainderLiveRegs(TRI); + + RemainderLiveRegs.addLiveOuts(MBB); + for (MachineBasicBlock::const_reverse_iterator I = MBB.rbegin(), E(&MI); + I != E; ++I) { + RemainderLiveRegs.stepBackward(*I); + } + + // Add reg defined in loop body. + RemainderLiveRegs.addReg(SaveReg); + + if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) { + RemainderLiveRegs.addReg(Val->getReg()); + LoopBB.addLiveIn(Val->getReg()); + } + + for (unsigned Reg : RemainderLiveRegs) + RemainderBB.addLiveIn(Reg); + + unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); + LoopBB.addLiveIn(SrcReg); + LoopBB.addLiveIn(IdxReg); + LoopBB.sortUniqueLiveIns(); +} + void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL, MachineInstr *MovRel, - unsigned SaveReg, unsigned IdxReg, int Offset) { MachineBasicBlock::iterator I = LoopBB.begin(); @@ -418,12 +456,11 @@ } void SILowerControlFlow::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { - MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock::iterator I = MI; + MachineBasicBlock::iterator I(&MI); - unsigned Idx = MI.getOperand(3).getReg(); + unsigned Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx)->getReg(); if (AMDGPU::SReg_32RegClass.contains(Idx)) { if (Offset) { @@ -441,14 +478,16 @@ } MachineFunction &MF = *MBB.getParent(); - unsigned Save = MI.getOperand(1).getReg(); + MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + SaveOp->setIsDead(false); + unsigned Save = SaveOp->getReg(); // Reading from a VGPR requires looping over all workitems in the wavefront. assert(AMDGPU::SReg_64RegClass.contains(Save) && AMDGPU::VGPR_32RegClass.contains(Idx)); // Save the EXEC mask - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save) .addReg(AMDGPU::EXEC); // To insert the loop we need to split the block. Move everything after this @@ -464,11 +503,14 @@ LoopBB->addSuccessor(LoopBB); LoopBB->addSuccessor(RemainderBB); + if (TRI->trackLivenessAfterRegAlloc(MF)) + splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, Idx); + // Move the rest of the block into a new block. RemainderBB->transferSuccessors(&MBB); RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); - emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, Save, Idx, Offset); + emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, Idx, Offset); MachineBasicBlock::iterator First = RemainderBB->begin(); BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) @@ -510,16 +552,16 @@ DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); - unsigned Vec = MI.getOperand(2).getReg(); - int Off = MI.getOperand(4).getImm(); + unsigned Vec = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); + int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); unsigned Reg; computeIndirectRegAndOffset(Vec, Reg, Off); MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(Reg) - .addReg(Vec, RegState::Implicit); + .addReg(Reg) + .addReg(Vec, RegState::Implicit); LoadM0(MI, MovRel, Off); } @@ -530,17 +572,17 @@ DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); - int Off = MI.getOperand(4).getImm(); - unsigned Val = MI.getOperand(5).getReg(); + int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); + unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)->getReg(); unsigned Reg; computeIndirectRegAndOffset(Dst, Reg, Off); MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) - .addReg(Reg, RegState::Define) - .addReg(Val) - .addReg(Dst, RegState::Implicit); + .addReg(Reg, RegState::Define) + .addReg(Val) + .addReg(Dst, RegState::Implicit); LoadM0(MI, MovRel, Off); } Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -220,9 +220,18 @@ %idx0 = load volatile i32, i32 addrspace(1)* %gep %idx1 = add i32 %idx0, 1 %val0 = extractelement <4 x i32> , i32 %idx0 + %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={SGPR4}" () %val1 = extractelement <4 x i32> , i32 %idx1 store volatile i32 %val0, i32 addrspace(1)* %out0 store volatile i32 %val1, i32 addrspace(1)* %out0 + %cmp = icmp eq i32 %id, 0 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + store volatile i32 %live.out.reg, i32 addrspace(1)* undef + br label %bb2 + +bb2: ret void } @@ -230,7 +239,7 @@ ; CHECK-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}} ; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]] ; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], s[[S_ELT0]] -; CHECK-DAG: v_mov_b32_e32 [[INS0:v[0-9]+]], 62 +; CHECK-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 ; CHECK-DAG: s_waitcnt vmcnt(0) ; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec @@ -259,6 +268,8 @@ ; CHECK: s_cbranch_execnz [[LOOP1]] ; CHECK: buffer_store_dwordx4 v{{\[}}[[MOVREL0]]: + +; CHECK: buffer_store_dword [[INS0]] define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -266,9 +277,18 @@ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext %idx0 = load volatile i32, i32 addrspace(1)* %gep %idx1 = add i32 %idx0, 1 - %vec1 = insertelement <4 x i32> %vec0, i32 62, i32 %idx0 + %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() + %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0 %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1 store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0 + %cmp = icmp eq i32 %id, 0 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + store volatile i32 %live.out.val, i32 addrspace(1)* undef + br label %bb2 + +bb2: ret void }