Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -139,6 +139,10 @@
   bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
+  bool selectMOVRELOffsetImpl(SDValue Index, SDValue &Base,
+                              SDValue &Offset, bool IsInsert) const;
+  bool selectMOVRELSOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+  bool selectMOVRELDOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -1251,6 +1255,51 @@
          !isa<ConstantSDNode>(Offset);
 }
 
+bool AMDGPUDAGToDAGISel::selectMOVRELOffsetImpl(SDValue Index,
+                                                SDValue &Base,
+                                                SDValue &Offset,
+                                                bool IsInsert) const {
+  SDLoc DL(Index);
+
+  if (CurDAG->isBaseWithConstantOffset(Index)) {
+    SDValue N0 = Index.getOperand(0);
+    SDValue N1 = Index.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+    // (add n0, c0)
+    Base = N0;
+    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
+    return true;
+  }
+
+  if (IsInsert) {
+    if (ConstantSDNode *CBase = dyn_cast<ConstantSDNode>(Index)) {
+      Base = CurDAG->getRegister(AMDGPU::NoRegister, MVT::i32);
+      Offset = CurDAG->getTargetConstant(CBase->getZExtValue(), DL, MVT::i32);
+      return true;
+    }
+  } else {
+    if (isa<ConstantSDNode>(Index))
+      return false;
+  }
+
+  Base = Index;
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::selectMOVRELSOffset(SDValue Index,
+                                             SDValue &Base,
+                                             SDValue &Offset) const {
+  return selectMOVRELOffsetImpl(Index, Base, Offset, false);
+}
+
+bool AMDGPUDAGToDAGISel::selectMOVRELDOffset(SDValue Index,
+                                             SDValue &Base,
+                                             SDValue &Offset) const {
+  return selectMOVRELOffsetImpl(Index, Base, Offset, true);
+}
+
 SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                      SDValue Val, uint32_t Offset,
                                      uint32_t Width) {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1713,7 +1713,8 @@
 
     if (RegClass != -1) {
       unsigned Reg = MI->getOperand(i).getReg();
-      if (TargetRegisterInfo::isVirtualRegister(Reg))
+      if (Reg == AMDGPU::NoRegister ||
+          TargetRegisterInfo::isVirtualRegister(Reg))
        continue;
 
       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -611,6 +611,9 @@
 def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
 
+def MOVRELSOffset : ComplexPattern<i32, 2, "selectMOVRELSOffset">;
+def MOVRELDOffset : ComplexPattern<i32, 2, "selectMOVRELDOffset">;
+
 def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
 def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
 def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
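To make the intent of selectMOVRELOffsetImpl easier to follow outside the SelectionDAG context, here is a minimal standalone sketch of the same base/offset split. All names and types below are illustrative only; the real code operates on SDValues and emits target constants.

```cpp
// Standalone model of the base/offset decomposition performed by
// selectMOVRELOffsetImpl above (hypothetical types; not the in-tree API).
#include <cassert>
#include <cstdint>
#include <optional>

struct IndexExpr {
  bool IsAddOfConstant; // index has the form (add base, imm)
  bool IsConstant;      // index is a bare constant
  int64_t Imm;          // constant part, if any
};

struct BaseOffset {
  bool HasBaseReg; // false models the AMDGPU::NoRegister base
  int64_t Offset;
};

// Returns nullopt when the pattern should be rejected, i.e. the extract
// case with a bare constant index, which is handled elsewhere.
std::optional<BaseOffset> splitIndex(const IndexExpr &Idx, bool IsInsert) {
  if (Idx.IsAddOfConstant)
    return BaseOffset{true, Idx.Imm}; // (add n0, c0) -> base n0, offset c0
  if (Idx.IsConstant) {
    if (IsInsert)
      return BaseOffset{false, Idx.Imm}; // no base register, pure offset
    return std::nullopt;                 // reject for extracts
  }
  return BaseOffset{true, 0}; // variable index, no constant part
}

int main() {
  // (add idx, 7): the constant moves out of the M0 computation.
  assert(splitIndex({true, false, 7}, /*IsInsert=*/false)->Offset == 7);
  // Bare constant insert index: offset only, no base register.
  assert(!splitIndex({false, true, 3}, /*IsInsert=*/true)->HasBaseReg);
  // Bare constant extract index: pattern rejected.
  assert(!splitIndex({false, true, 3}, /*IsInsert=*/false));
  return 0;
}
```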
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -2026,12 +2026,11 @@
 class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
   (outs VGPR_32:$vdst, SReg_64:$sdst),
-  (ins rc:$src, VSrc_32:$idx, i32imm:$offset)
->;
+  (ins rc:$src, VS_32:$idx, i32imm:$offset)>;
 
 class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
   (outs rc:$vdst, SReg_64:$sdst),
-  (ins unknown:$src, VSrc_32:$idx, i32imm:$offset, VGPR_32:$val)> {
+  (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
   let Constraints = "$src = $vdst";
 }
 
 def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
@@ -3313,29 +3312,16 @@
 /********** ====================== **********/
 
 multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
-
-  // 1. Extract with offset
-  def : Pat<
-    (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))),
-    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off)
-  >;
-
-  // 2. Extract without offset
-  def : Pat<
-    (eltvt (extractelt vt:$vec, i32:$idx)),
-    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0)
-  >;
-
-  // 3. Insert with offset
+  // Extract with offset
   def : Pat<
-    (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
-    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val)
+    (eltvt (extractelt vt:$src, (MOVRELSOffset i32:$idx, (i32 imm:$offset)))),
+    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
   >;
 
-  // 4. Insert without offset
+  // Insert with offset
   def : Pat<
-    (insertelt vt:$vec, eltvt:$val, i32:$idx),
-    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val)
+    (insertelt vt:$src, eltvt:$val, (MOVRELDOffset i32:$idx, (i32 imm:$offset))),
+    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
   >;
 }
 
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -448,7 +448,7 @@
   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
     .addReg(AMDGPU::VCC);
 
-  if (Offset) {
+  if (Offset != 0) {
     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
       .addReg(AMDGPU::M0)
       .addImm(Offset);
@@ -476,7 +476,7 @@
   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
 
   if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
-    if (Offset) {
+    if (Offset != 0) {
       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
         .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
         .addImm(Offset);
@@ -532,16 +532,17 @@
   return true;
 }
 
-/// \param @VecReg The register which holds element zero of the vector
-/// being addressed into.
-/// \param[out] @Reg The base register to use in the indirect addressing instruction.
-/// \param[in,out] @Offset As an input, this is the constant offset part of the
-//                         indirect Index. e.g. v0 = v[VecReg + Offset]
-//                         As an output, this is a constant value that needs
-//                         to be added to the value stored in M0.
+/// \param @VecReg The register which holds element zero of the vector being
+/// addressed into.
+///
+/// \param[in] @Idx The index operand from the movrel instruction. This must be
+/// a register, but may be NoRegister.
+///
+/// \param[in] @Offset As an input, this is the constant offset part of the
+/// indirect Index, e.g. v0 = v[VecReg + Offset]. As an output, this is a
+/// constant value that needs to be added to the value stored in M0.
 std::pair<unsigned, int>
-SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
-                                                int Offset) const {
+SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
   unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
   if (!SubReg)
     SubReg = VecReg;
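The contract documented above leaves computeIndirectRegAndOffset free to fold an in-range constant into the subregister choice, handing back only the leftover to be added to M0. A rough standalone model of that contract, assuming a tuple of NumElts 32-bit elements; this is a sketch of the documented behavior, not the in-tree implementation:

```cpp
// Hypothetical model of the computeIndirectRegAndOffset contract: fold as
// much of the constant offset as possible into the subregister index and
// return whatever remains for the M0 computation.
#include <cassert>
#include <utility>

// BaseIdx: hardware index of element zero (models sub0).
// NumElts: number of 32-bit elements in the vector register tuple.
std::pair<unsigned, int> foldOffsetIntoSubReg(unsigned BaseIdx,
                                              unsigned NumElts, int Offset) {
  // An in-range constant offset selects a subregister directly, so no
  // runtime add on M0 is needed.
  if (Offset >= 0 && static_cast<unsigned>(Offset) < NumElts)
    return {BaseIdx + Offset, 0};
  // Otherwise keep the base element and let the caller add Offset to M0.
  return {BaseIdx, Offset};
}

int main() {
  // v[8:11] indexed with constant offset 2 -> v10, nothing added to M0.
  assert(foldOffsetIntoSubReg(8, 4, 2) == std::make_pair(10u, 0));
  // An out-of-range offset stays in the M0 computation.
  assert(foldOffsetIntoSubReg(8, 4, 7) == std::make_pair(8u, 7));
  return 0;
}
```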
@@ -572,42 +573,59 @@
 // Return true if a new block was inserted.
 bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  const DebugLoc &DL = MI.getDebugLoc();
 
   unsigned Dst = MI.getOperand(0).getReg();
   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
-  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
+  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
   unsigned Reg;
 
-  std::tie(Reg, Off) = computeIndirectRegAndOffset(SrcVec->getReg(), Off);
+  std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  if (Idx->getReg() == AMDGPU::NoRegister) {
+    // Only had a constant offset, copy the register directly.
+    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+      .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
+    MI.eraseFromParent();
+    return false;
+  }
 
   MachineInstr *MovRel =
     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
       .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
       .addReg(SrcVec->getReg(), RegState::Implicit);
 
-  return loadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Offset);
 }
 
 // Return true if a new block was inserted.
 bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  const DebugLoc &DL = MI.getDebugLoc();
 
   unsigned Dst = MI.getOperand(0).getReg();
-  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
-  MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
   unsigned Reg;
 
-  std::tie(Reg, Off) = computeIndirectRegAndOffset(Dst, Off);
+  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+  std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
+
+  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  if (Idx->getReg() == AMDGPU::NoRegister) {
+    // Only had a constant offset, copy the register directly.
+    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
+      .addOperand(*Val);
+    MI.eraseFromParent();
+    return false;
+  }
 
   MachineInstr *MovRel =
-    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
-      .addReg(Reg, RegState::Define)
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
      .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
      .addReg(Dst, RegState::Implicit);
 
-  return loadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Offset);
 }
 
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
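The NoRegister fast path above means a purely constant index no longer touches M0 at all. A sketch of the instruction shapes this lowering can now produce for a scalar index; the register numbers and the printf-based "emission" are illustrative, not what the pass actually emits:

```cpp
// Sketch of the scalar-index read lowering after this change.
#include <cstdio>

enum : unsigned { NoRegister = 0 }; // stand-in for AMDGPU::NoRegister

void lowerIndirectReadSGPRIdx(unsigned IdxSReg, unsigned ElemVReg, int M0Add) {
  if (IdxSReg == NoRegister) {
    // The index was entirely constant and was folded into the subregister
    // choice: the movrels collapses to a plain copy and M0 is untouched.
    std::printf("v_mov_b32_e32 vdst, v%u\n", ElemVReg);
    return;
  }
  if (M0Add != 0) // leftover constant is folded into the M0 write
    std::printf("s_add_i32 m0, s%u, %d\n", IdxSReg, M0Add);
  else
    std::printf("s_mov_b32 m0, s%u\n", IdxSReg);
  std::printf("v_movrels_b32_e32 vdst, v%u\n", ElemVReg);
}

int main() {
  lowerIndirectReadSGPRIdx(NoRegister, 3, 0); // constant index: plain copy
  lowerIndirectReadSGPRIdx(5, 2, 0);          // dynamic index, no offset
  lowerIndirectReadSGPRIdx(5, 2, 4);          // dynamic index plus offset
  return 0;
}
```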
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -452,6 +452,39 @@
   ret void
 }
 
+; Test that the or is folded into the base address register instead of
+; added to m0.
+
+; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+entry:
+  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+  %idx.shl = shl i32 %idx.in, 2
+  %idx = or i32 %idx.shl, 1
+  %value = extractelement <4 x i32> %ld, i32 %idx
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+  %idx.shl = shl i32 %idx.in, 2
+  %idx = or i32 %idx.shl, 1
+  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
+  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -346,9 +346,9 @@
 ; FIXME: Should be able to manipulate m0 directly instead of add and
 ; copy.
 
-; GCN: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1
+; FIXME: Should avoid resetting m0 to the same value
 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
-; GCN-DAG: s_mov_b32 m0, [[IDX1]]
+; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
 ; GCN: buffer_store_dwordx4
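The new tests rely on the or being recognized as an add: after the shl by 2, the low two bits of %idx.shl are known zero, so or-ing in 1 cannot carry, and SelectionDAG's isBaseWithConstantOffset can treat the or as (add base, 1). A quick standalone check of that identity:

```cpp
// Exhaustive demonstration (over 16-bit inputs, for brevity) that
// (x << 2) | 1 equals (x << 2) + 1 when the low bits are known zero.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x <= 0xFFFF; ++x) {
    uint32_t shl = x << 2;          // low 2 bits are zero
    assert((shl | 1) == (shl + 1)); // or == add when no set bits overlap
  }
  return 0;
}
```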