Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2438,7 +2438,8 @@
   MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
   unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
 
-  if (VAddr) {
+  int Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
+  if (Addr64Opcode == -1) {
     // This is already an ADDR64 instruction so we need to add the pointer
     // extracted from the resource descriptor to the current value of VAddr.
     unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -2462,8 +2463,9 @@
       .addReg(NewVAddrHi)
       .addImm(AMDGPU::sub1);
   } else {
-    // This instructions is the _OFFSET variant, so we need to convert it to
-    // ADDR64.
+
+    // This instruction is the _OFFSET or _IDXEN variant, so we need to
+    // convert it to ADDR64.
     assert(MBB.getParent()->getSubtarget().getGeneration()
            < SISubtarget::VOLCANIC_ISLANDS &&
            "FIXME: Need to emit flat atomics here");
@@ -2471,7 +2473,6 @@
     MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
     MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
     MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
-    unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
 
     // Atomics with return have an additional tied operand and are
     // missing some of the special bits.
@@ -2520,15 +2521,40 @@
         .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
     }
 
-    MI.removeFromParent();
+    const DebugLoc &DL = Addr64->getDebugLoc();
+    if (VAddr) {
+      // This is the _IDXEN variant. Add the 32-bit index to the pointer.
 
-    // NewVaddr = {NewVaddrHi, NewVaddrLo}
-    BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-            NewVAddr)
+      unsigned AddLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      unsigned AddHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+      // NewVaddrLo = SRsrcPtr:sub0 + vaddr
+      BuildMI(MBB, Addr64, DL, get(AMDGPU::V_ADD_I32_e32), AddLo)
+        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+        .addReg(VAddr->getReg(), 0, VAddr->getSubReg());
+
+      // NewVaddrHi = SRsrcPtr:sub1 + carry
+      BuildMI(MBB, Addr64, DL, get(AMDGPU::V_ADDC_U32_e32), AddHi)
+        .addImm(0)
+        .addReg(SRsrcPtr, 0, AMDGPU::sub1);
+
+      // NewVaddr = {NewVaddrHi, NewVaddrLo}
+      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+              NewVAddr)
+        .addReg(AddLo)
+        .addImm(AMDGPU::sub0)
+        .addReg(AddHi)
+        .addImm(AMDGPU::sub1);
+    } else {
+      // NewVaddr = {NewVaddrHi, NewVaddrLo}
+      BuildMI(MBB, Addr64, DL, get(AMDGPU::REG_SEQUENCE), NewVAddr)
       .addReg(SRsrcPtr, 0, AMDGPU::sub0)
       .addImm(AMDGPU::sub0)
       .addReg(SRsrcPtr, 0, AMDGPU::sub1)
       .addImm(AMDGPU::sub1);
+    }
+
+    MI.removeFromParent();
 
     VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
     SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
Index: test/CodeGen/AMDGPU/move-to-valu-mubuf-idxen.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/move-to-valu-mubuf-idxen.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Broken for VI
+
+; GCN-LABEL: {{^}}move_to_valu_buffer_load_dword_idxen:
+; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VRSRC0:[0-9]+]]:[[VRSRC1:[0-9]+]]{{\]}}
+; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
+; GCN-DAG: s_mov_b64 s{{\[}}[[SRSRC0:[0-9]+]]:{{[0-9]+\]}}, 0{{$}}
+
+; GCN-DAG: v_add_i32_e32 v[[ADD_LO:[0-9]+]], vcc, [[IDX]], v[[VRSRC0]]
+; GCN-DAG: v_addc_u32_e32 v[[ADD_HI:[0-9]+]], vcc, 0, v[[VRSRC1]], vcc
+
+; GCN: buffer_load_dword [[RESULT:v[0-9]+]], v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}, s{{\[}}[[SRSRC0]]:{{[0-9]+\]}}, 0 addr64 offset:124
+; GCN: buffer_store_dword [[RESULT]]
+define void @move_to_valu_buffer_load_dword_idxen(float addrspace(1)* %out, <2 x i32> addrspace(1)* %ptr, i32 %idx) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = zext i32 %tid to i64
+  %gep = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %ptr, i64 %tid.ext
+  %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %rsrc.ptr = load <2 x i32>, <2 x i32> addrspace(1)* %gep
+  %vgpr.rsrc = shufflevector <2 x i32> %rsrc.ptr, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %load = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %vgpr.rsrc, i32 %idx, i32 124, i1 0, i1 0)
+  store float %load, float addrspace(1)* %gep.out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
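Note (reviewer addendum, not part of the patch): the new _IDXEN path forms the
64-bit address as SRsrcPtr + zext(vaddr) with a two-instruction carry chain.
V_ADD_I32_e32 produces the low 32 bits and writes the carry-out to VCC, and
V_ADDC_U32_e32 folds that carry into the high half (0 + SRsrcPtr:sub1 + VCC).
Below is a minimal C++ sketch of the equivalent scalar arithmetic; the helper
name addr64FromIdxen is hypothetical and exists only for illustration.

#include <cstdint>

// Sketch of what the emitted VALU pair computes. rsrc_base stands for the
// 64-bit pointer extracted from the resource descriptor (SRsrcPtr) and idx
// for the 32-bit vaddr index operand, which is zero-extended.
uint64_t addr64FromIdxen(uint64_t rsrc_base, uint32_t idx) {
  uint32_t base_lo = static_cast<uint32_t>(rsrc_base);
  uint32_t base_hi = static_cast<uint32_t>(rsrc_base >> 32);

  uint32_t lo = base_lo + idx;       // V_ADD_I32_e32: low add, carry-out -> VCC
  uint32_t carry = lo < base_lo;     // the carry bit V_ADDC_U32_e32 consumes
  uint32_t hi = 0 + base_hi + carry; // V_ADDC_U32_e32: 0 + sub1 + carry-in

  // REG_SEQUENCE {lo -> sub0, hi -> sub1} yields the new 64-bit vaddr.
  return (static_cast<uint64_t>(hi) << 32) | lo;
}

This is the pattern the new test checks for: a v_add_i32_e32/v_addc_u32_e32
pair whose result registers v[ADD_LO:ADD_HI] feed the addr64 buffer_load.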