Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2941,22 +2941,6 @@ return InsPt; } -// Returns subreg index, offset -static std::pair -computeIndirectRegAndOffset(const SIRegisterInfo &TRI, - const TargetRegisterClass *SuperRC, - unsigned VecReg, - int Offset) { - int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32; - - // Skip out of bounds offsets, or else we would end up using an undefined - // register. - if (Offset >= NumElts || Offset < 0) - return std::make_pair(AMDGPU::sub0, Offset); - - return std::make_pair(AMDGPU::sub0 + Offset, 0); -} - // Return true if the index is an SGPR and was set. static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, @@ -3019,7 +3003,6 @@ MachineBasicBlock &MBB, const GCNSubtarget &ST) { const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -3027,12 +3010,11 @@ unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); - const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); - - unsigned SubReg; - std::tie(SubReg, Offset) - = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); - + // SubReg should always be AMDGPU::sub0. The 8-bit m0 field for the index + // is unsigned. We can guarantee the index non-negative (if the program + // itself is correct) only when we start from the very first register in + // the vector. + unsigned SubReg = AMDGPU::sub0; bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode); if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) { @@ -3124,10 +3106,11 @@ // This can be an immediate, but will be folded later. assert(Val->getReg()); - unsigned SubReg; - std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC, - SrcVec->getReg(), - Offset); + // SubReg should always be AMDGPU::sub0. The 8-bit m0 field for the index + // is unsigned. We can guarantee the index non-negative (if the program + // itself is correct) only when we start from the very first register in + // the vector. + unsigned SubReg = AMDGPU::sub0; bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode); if (Idx->getReg() == AMDGPU::NoRegister) { Index: test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll @@ -43,10 +43,11 @@ ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] ; GCN: s_and_saveexec_b64 vcc, vcc -; MOVREL: s_mov_b32 m0, [[READLANE]] +; MOVREL: s_add_i32 m0, [[READLANE]], 1 ; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63 -; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst +; IDXMODE: s_add_i32 [[IDX:s[0-9]+]], [[READLANE]], 1 +; IDXMODE: s_set_gpr_idx_on [[IDX]], dst ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63 ; IDXMODE: s_set_gpr_idx_off Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -10,13 +10,14 @@ ; GCN-DAG: s_load_dword [[IN:s[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 -; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 +; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0 -; MOVREL-DAG: s_mov_b32 m0, [[IN]] +; MOVREL-DAG: s_add_i32 m0, [[IN]], 1 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]] -; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}} +; IDXMODE-DAG: s_add_i32 [[IN1:s[0-9]+]], [[IN]], 1 +; IDXMODE: s_set_gpr_idx_on [[IN1]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] ; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) { @@ -29,7 +30,8 @@ ; XXX: Could do v_or_b32 directly ; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector: -; MOVREL: s_mov_b32 m0 +; MOVREL: s_load_dword [[IDX:s[0-9]+]] +; MOVREL: s_add_i32 m0, [[IDX]], 1 ; GCN-DAG: s_or_b32 ; GCN-DAG: s_or_b32 ; GCN-DAG: s_or_b32 @@ -177,7 +179,7 @@ ; GCN-LABEL: {{^}}insert_w_offset: ; GCN-DAG: s_load_dword [[IN:s[0-9]+]] -; MOVREL-DAG: s_mov_b32 m0, [[IN]] +; MOVREL-DAG: s_add_i32 m0, [[IN]], 1 ; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0 ; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0 ; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000 @@ -185,7 +187,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000 ; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000 -; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]] +; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]] ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}} define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) { entry: @@ -373,7 +375,7 @@ ; FIXME: Redundant copy ; GCN: s_mov_b64 exec, [[MASK]] -; GCN: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]] +; GCN: v_mov_b32_e32 [[VEC_ELT0_2:v[0-9]+]], [[S_ELT0]] ; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec @@ -382,11 +384,12 @@ ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] ; GCN: s_and_saveexec_b64 vcc, vcc -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]] +; MOVREL: s_add_i32 m0, [[READLANE]], 1 +; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT0_2]] -; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0 -; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]] +; IDXMODE: s_add_i32 [[INDEX:s[0-9]+]], [[READLANE]], 1 +; IDXMODE: s_set_gpr_idx_on [[INDEX]], src0 +; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT0_2]] ; IDXMODE: s_set_gpr_idx_off ; GCN-NEXT: s_xor_b64 exec, exec, vcc @@ -492,13 +495,15 @@ ; offset puts outside of superegister bounaries, so clamp to 1st element. ; GCN-LABEL: {{^}}extract_largest_inbounds_offset: -; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\].* offset:48}} -; GCN-DAG: s_load_dword [[IDX:s[0-9]+]] -; MOVREL: s_mov_b32 m0, [[IDX]] -; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]] +; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]] +; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]] +; MOVREL: s_add_i32 m0, [[IDX0]], 15 +; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] + +; IDXMODE: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15 ; IDXMODE: s_set_gpr_idx_on [[IDX]], src0 -; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]] +; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[EXTRACT]] @@ -538,12 +543,12 @@ ; GCN-LABEL: {{^}}extractelement_v16i32_or_index: ; GCN: s_load_dword [[IDX_IN:s[0-9]+]] ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]] -; GCN-NOT: [[IDX_SHL]] -; MOVREL: s_mov_b32 m0, [[IDX_SHL]] +; MOVREL: s_add_i32 m0, [[IDX_SHL]], 1 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0 +; IDXMODE: s_add_i32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1 +; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], src0 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) { @@ -559,12 +564,12 @@ ; GCN-LABEL: {{^}}insertelement_v16f32_or_index: ; GCN: s_load_dword [[IDX_IN:s[0-9]+]] ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]] -; GCN-NOT: [[IDX_SHL]] -; MOVREL: s_mov_b32 m0, [[IDX_SHL]] +; MOVREL: s_add_i32 m0, [[IDX_SHL]], 1 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst +; IDXMODE: s_add_i32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1 +; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], dst ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {