Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2319,6 +2319,29 @@
     .addImm(SubHi);
 }
+
+static int getFlatOpFromSMRD(uint16_t Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORD_SGPR:
+  case AMDGPU::S_LOAD_DWORD_IMM_ci:
+    return AMDGPU::FLAT_LOAD_DWORD;
+  case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_SGPR:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
+    return AMDGPU::FLAT_LOAD_DWORDX2;
+  case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_SGPR:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
+    return AMDGPU::FLAT_LOAD_DWORDX4;
+  default:
+    llvm_unreachable("invalid SMRD opcode");
+    return -1;
+  }
+
+  return -1;
+}
+
 void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
                                  MachineRegisterInfo &MRI,
                                  SmallVectorImpl<MachineInstr *> &Worklist) const {
@@ -2330,77 +2353,156 @@
   case 4:
   case 8:
   case 16: {
-    unsigned NewOpcode = getVALUOp(*MI);
-    unsigned RegOffset;
-    unsigned ImmOffset;
+    DebugLoc DL = MI->getDebugLoc();
+    MachineInstr *NewInst = nullptr;
+    unsigned NewDstReg;
+
+    const MachineOperand *DstOpnd = getNamedOperand(*MI, AMDGPU::OpName::dst);
+    const MachineOperand *SBaseOpnd = getNamedOperand(*MI, AMDGPU::OpName::sbase);
+    const MachineOperand *OffsetOpnd = getNamedOperand(*MI, AMDGPU::OpName::offset);
+    unsigned DstReg = DstOpnd->getReg();
+
+    if (ST.useFlatForGlobal() ||
+        ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+      unsigned FlatOpcode = getFlatOpFromSMRD(MI->getOpcode());
+      const MCInstrDesc &NewInstDesc = get(FlatOpcode);
+      const TargetRegisterClass *NewDstRC
+        = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
+      NewDstReg = MRI.createVirtualRegister(NewDstRC);
+
+      MRI.replaceRegWith(DstReg, NewDstReg);
+
+      unsigned ImmOffset = 0;
+      unsigned RegOffset;
+      bool HasOffset = false;
+
+      if (OffsetOpnd->isReg()) {
+        RegOffset = OffsetOpnd->getReg();
+        HasOffset = true;
+      } else {
+        assert(OffsetOpnd->isImm());
+        ImmOffset = OffsetOpnd->getImm();
+        // SMRD instructions take a dword offset on SI/CI and a byte offset on
+        // VI, while FLAT instructions always take a byte offset.
+        if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+          ImmOffset <<= 2;
+        HasOffset = (ImmOffset != 0);
+        if (ImmOffset > 64) { // Too large for an inline constant.
+          RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+          BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+                  RegOffset)
+            .addImm(ImmOffset);
+          ImmOffset = 0;
+        }
+      }
-    if (MI->getOperand(2).isReg()) {
-      RegOffset = MI->getOperand(2).getReg();
-      ImmOffset = 0;
+      // Compute VAddr for the flat load.
+      unsigned VAddr;
+      if (HasOffset) { // VAddr = SBase + Offset
+        unsigned VAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        unsigned VAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        VAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+        if (ImmOffset == 0) {
+          BuildMI(*MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), VAddrLo)
+            .addReg(AMDGPU::VCC, RegState::Define)
+            .addReg(SBaseOpnd->getReg(), 0, AMDGPU::sub0)
+            .addReg(RegOffset);
+        } else {
+          BuildMI(*MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), VAddrLo)
+            .addReg(AMDGPU::VCC, RegState::Define)
+            .addReg(SBaseOpnd->getReg(), 0, AMDGPU::sub0)
+            .addImm(ImmOffset);
+        }
+
+        BuildMI(*MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), VAddrHi)
+          .addReg(AMDGPU::VCC, RegState::Define)
+          .addReg(SBaseOpnd->getReg(), 0, AMDGPU::sub1)
+          .addImm(0)
+          .addReg(AMDGPU::VCC);
+
+        BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), VAddr)
+          .addReg(VAddrLo)
+          .addImm(AMDGPU::sub0)
+          .addReg(VAddrHi)
+          .addImm(AMDGPU::sub1);
+      } else { // VAddr = SBase
+        VAddr = SBaseOpnd->getReg();
+      }
+      // Build a flat load.
+      NewInst = BuildMI(*MBB, MI, DL, NewInstDesc, NewDstReg)
+        .addReg(VAddr)
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0) // tfe
+        .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
     } else {
-      assert(MI->getOperand(2).isImm());
-      // SMRD instructions take a dword offsets on SI and byte offset on VI
-      // and MUBUF instructions always take a byte offset.
-      ImmOffset = MI->getOperand(2).getImm();
-      if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <=
-          AMDGPUSubtarget::SEA_ISLANDS)
-        ImmOffset <<= 2;
-      RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-
-      if (isUInt<12>(ImmOffset)) {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-                RegOffset)
-                .addImm(0);
-      } else {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-                RegOffset)
-                .addImm(ImmOffset);
+      unsigned NewOpcode = getVALUOp(*MI);
+      unsigned RegOffset;
+      unsigned ImmOffset;
+
+      if (OffsetOpnd->isReg()) {
+        RegOffset = OffsetOpnd->getReg();
         ImmOffset = 0;
+      } else {
+        assert(OffsetOpnd->isImm());
+        // SMRD instructions take a dword offset on SI and a byte offset on VI,
+        // and MUBUF instructions always take a byte offset.
+        ImmOffset = OffsetOpnd->getImm();
+        if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+          ImmOffset <<= 2;
+        RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+        if (isUInt<12>(ImmOffset)) {
+          BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), RegOffset)
+            .addImm(0);
+        } else {
+          BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), RegOffset)
+            .addImm(ImmOffset);
+          ImmOffset = 0;
+        }
+      }
-    }
-    unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
-    unsigned DWord0 = RegOffset;
-    unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
+      unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+      unsigned DWord0 = RegOffset;
+      unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
+
+      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), DWord1)
+        .addImm(0);
+      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), DWord2)
+        .addImm(RsrcDataFormat & 0xFFFFFFFF);
+      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), DWord3)
+        .addImm(RsrcDataFormat >> 32);
+      BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), SRsrc)
+        .addReg(DWord0)
+        .addImm(AMDGPU::sub0)
+        .addReg(DWord1)
+        .addImm(AMDGPU::sub1)
+        .addReg(DWord2)
+        .addImm(AMDGPU::sub2)
+        .addReg(DWord3)
+        .addImm(AMDGPU::sub3);
+
+      const MCInstrDesc &NewInstDesc = get(NewOpcode);
+      const TargetRegisterClass *NewDstRC
+        = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
+      NewDstReg = MRI.createVirtualRegister(NewDstRC);
+      MRI.replaceRegWith(DstReg, NewDstReg);
+
+      NewInst = BuildMI(*MBB, MI, DL, NewInstDesc, NewDstReg)
+        .addOperand(*SBaseOpnd) // sbase
+        .addReg(SRsrc)
+        .addImm(0)
+        .addImm(ImmOffset)
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0) // tfe
+        .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+    }
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
-            .addImm(0);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
-            .addImm(RsrcDataFormat & 0xFFFFFFFF);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
-            .addImm(RsrcDataFormat >> 32);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
-            .addReg(DWord0)
-            .addImm(AMDGPU::sub0)
-            .addReg(DWord1)
-            .addImm(AMDGPU::sub1)
-            .addReg(DWord2)
-            .addImm(AMDGPU::sub2)
-            .addReg(DWord3)
-            .addImm(AMDGPU::sub3);
-
-    const MCInstrDesc &NewInstDesc = get(NewOpcode);
-    const TargetRegisterClass *NewDstRC
-      = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
-    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
-    unsigned DstReg = MI->getOperand(0).getReg();
-    MRI.replaceRegWith(DstReg, NewDstReg);
-
-    MachineInstr *NewInst =
-      BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
-      .addOperand(MI->getOperand(1)) // sbase
-      .addReg(SRsrc)
-      .addImm(0)
-      .addImm(ImmOffset)
-      .addImm(0) // glc
-      .addImm(0) // slc
-      .addImm(0) // tfe
-      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
     MI->eraseFromParent();
-    legalizeOperands(NewInst);
     addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
     break;
Index: test/CodeGen/AMDGPU/salu-to-valu.ll
===================================================================
--- test/CodeGen/AMDGPU/salu-to-valu.ll
+++ test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -53,8 +53,8 @@
 
 ; Test moving an SMRD instruction to the VALU
 ; GCN-LABEL: {{^}}smrd_valu:
-; FIXME: We should be using flat load for HSA.
-; GCN: buffer_load_dword [[OUT:v[0-9]+]]
+; GCN-NOHSA: buffer_load_dword [[OUT:v[0-9]+]]
+; GCN-HSA: flat_load_dword [[OUT:v[0-9]+]]
 ; GCN-NOHSA: buffer_store_dword [[OUT]]
 ; GCN-HSA: flat_store_dword [[OUT]]
 define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
Index: test/CodeGen/AMDGPU/smrd-to-flat.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/smrd-to-flat.ll
@@ -0,0 +1,69 @@
+; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-NOHSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-HSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-HSA %s
+
+; Test moving an SMRD instruction to the VALU with an inlinable immediate offset.
+
+; GCN-LABEL: {{^}}smrd_valu_inline_imm:
+
+; GCN-HSA: v_add_i32_e32 v{{[0-9]}}, vcc, 16
+; GCN-HSA: flat_load_dword [[OUT:v[0-9]+]]
+; GCN-NOHSA: buffer_load_dword [[OUT:v[0-9]+]]
+; GCN-HSA: flat_store_dword [[OUT]]
+; GCN-NOHSA: buffer_store_dword [[OUT]]
+define void @smrd_valu_inline_imm(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
+entry:
+  %tmp = icmp ne i32 %a, 0
+  br i1 %tmp, label %if, label %else
+
+if:                                               ; preds = %entry
+  %tmp1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+  br label %endif
+
+else:                                             ; preds = %entry
+  %tmp2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+  %tmp3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %tmp2
+  br label %endif
+
+endif:                                            ; preds = %else, %if
+  %tmp4 = phi i32 addrspace(2)* [ %tmp1, %if ], [ %tmp3, %else ]
+  %tmp5 = getelementptr i32, i32 addrspace(2)* %tmp4, i32 4
+  %tmp6 = load i32, i32 addrspace(2)* %tmp5
+  store i32 %tmp6, i32 addrspace(1)* %out
+  ret void
+}
+
+
+; Test moving an SMRD instruction to the VALU with a large immediate offset.
+
+; GCN-LABEL: {{^}}smrd_valu_large_imm:
+
+; GCN-HSA: s_movk_i32 s{{[0-9]}}, 0x80
+; GCN-HSA: flat_load_dword [[OUT:v[0-9]+]]
+; GCN-NOHSA: buffer_load_dword [[OUT:v[0-9]+]]
+; GCN-HSA: flat_store_dword [[OUT]]
+; GCN-NOHSA: buffer_store_dword [[OUT]]
+define void @smrd_valu_large_imm(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
+entry:
+  %tmp = icmp ne i32 %a, 0
+  br i1 %tmp, label %if, label %else
+
+if:                                               ; preds = %entry
+  %tmp1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+  br label %endif
+
+else:                                             ; preds = %entry
+  %tmp2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+  %tmp3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %tmp2
+  br label %endif
+
+endif:                                            ; preds = %else, %if
+  %tmp4 = phi i32 addrspace(2)* [ %tmp1, %if ], [ %tmp3, %else ]
+  %tmp5 = getelementptr i32, i32 addrspace(2)* %tmp4, i32 32
+  %tmp6 = load i32, i32 addrspace(2)* %tmp5
+  store i32 %tmp6, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }