Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2319,6 +2319,34 @@
     .addImm(SubHi);
 }
 
+
+// Maps a MUBUF load opcode to its FLAT equivalent. We may reduce this list.
+static int getFlatInst(uint16_t Opcode) {
+  switch (Opcode) {
+
+  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET: return AMDGPU::FLAT_LOAD_DWORDX2;
+  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET: return AMDGPU::FLAT_LOAD_DWORDX4;
+  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:   return AMDGPU::FLAT_LOAD_DWORD;
+  case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:   return AMDGPU::FLAT_LOAD_SBYTE;
+  case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:  return AMDGPU::FLAT_LOAD_SSHORT;
+  case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:   return AMDGPU::FLAT_LOAD_UBYTE;
+  case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:  return AMDGPU::FLAT_LOAD_USHORT;
+
+  case AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64: return AMDGPU::FLAT_LOAD_DWORDX2;
+  case AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64: return AMDGPU::FLAT_LOAD_DWORDX4;
+  case AMDGPU::BUFFER_LOAD_DWORD_ADDR64:   return AMDGPU::FLAT_LOAD_DWORD;
+  case AMDGPU::BUFFER_LOAD_SBYTE_ADDR64:   return AMDGPU::FLAT_LOAD_SBYTE;
+  case AMDGPU::BUFFER_LOAD_SSHORT_ADDR64:  return AMDGPU::FLAT_LOAD_SSHORT;
+  case AMDGPU::BUFFER_LOAD_UBYTE_ADDR64:   return AMDGPU::FLAT_LOAD_UBYTE;
+  case AMDGPU::BUFFER_LOAD_USHORT_ADDR64:  return AMDGPU::FLAT_LOAD_USHORT;
+
+  default: llvm_unreachable("invalid mubuf opcode");
+  }
+
+  return -1;
+}
+
+
 void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
                                  MachineRegisterInfo &MRI,
                                  SmallVectorImpl<MachineInstr *> &Worklist) const {
@@ -2331,76 +2359,151 @@
   case 8:
   case 16: {
     unsigned NewOpcode = getVALUOp(*MI);
-    unsigned RegOffset;
-    unsigned ImmOffset;
+    DebugLoc DL = MI->getDebugLoc();
+    MachineInstr *NewInst = nullptr;
+    unsigned NewDstReg;
+
+    if (ST.useFlatForGlobal()) {
+      unsigned FlatOpcode = getFlatInst(NewOpcode);
+      const MCInstrDesc &NewInstDesc = get(FlatOpcode);
+      const TargetRegisterClass *NewDstRC
+        = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
+      NewDstReg = MRI.createVirtualRegister(NewDstRC);
+      unsigned DstReg = MI->getOperand(0).getReg();
+      MRI.replaceRegWith(DstReg, NewDstReg);
+
+      MachineOperand SBase = MI->getOperand(1);
+
+      unsigned ImmOffset = 0;
+      unsigned RegOffset;
+      bool HasOffset = false;
+
+      if (MI->getOperand(2).isReg()) {
+        RegOffset = MI->getOperand(2).getReg();
+        HasOffset = true;
+      } else {
+        assert(MI->getOperand(2).isImm());
+        ImmOffset = MI->getOperand(2).getImm();
+        HasOffset = (ImmOffset != 0);
+        if (!isUInt<6>(ImmOffset)) {
+          RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+          BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
+                  RegOffset)
+            .addImm(ImmOffset);
+          ImmOffset = 0;
+        }
+      }
 
-    if (MI->getOperand(2).isReg()) {
-      RegOffset = MI->getOperand(2).getReg();
-      ImmOffset = 0;
+      // Compute VAddr for the flat load.
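+      // If there is an offset, VAddr = SBase + offset: V_ADD_I32_e64 adds the
+      // low 32 bits and defines the carry-out in VCC, and V_ADDC_U32_e64 adds
+      // that carry into the high 32 bits. Otherwise VAddr is simply SBase.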
+      unsigned VAddr;
+      if (HasOffset) { // VAddr = SBase + Offset;
+        unsigned VAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        unsigned VAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        VAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+        if (ImmOffset == 0) {
+          BuildMI(*MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), VAddrLo)
+            .addReg(AMDGPU::VCC, RegState::Define)
+            .addReg(SBase.getReg(), 0, AMDGPU::sub0)
+            .addReg(RegOffset);
+        } else {
+          BuildMI(*MBB, MI, DL, get(AMDGPU::V_ADD_I32_e64), VAddrLo)
+            .addReg(AMDGPU::VCC, RegState::Define)
+            .addReg(SBase.getReg(), 0, AMDGPU::sub0)
+            .addImm(ImmOffset);
+        }
+
+        BuildMI(*MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), VAddrHi)
+          .addReg(AMDGPU::VCC, RegState::Define)
+          .addReg(SBase.getReg(), 0, AMDGPU::sub1)
+          .addImm(0)
+          .addReg(AMDGPU::VCC);
+
+        BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), VAddr)
+          .addReg(VAddrLo)
+          .addImm(AMDGPU::sub0)
+          .addReg(VAddrHi)
+          .addImm(AMDGPU::sub1);
+      } else { // VAddr = SBase;
+        VAddr = SBase.getReg();
+      }
+      // Build a flat load.
+      NewInst = BuildMI(*MBB, MI, DL, NewInstDesc, NewDstReg)
+        .addReg(VAddr)
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0) // tfe
+        .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
     } else {
-      assert(MI->getOperand(2).isImm());
-      // SMRD instructions take a dword offsets on SI and byte offset on VI
-      // and MUBUF instructions always take a byte offset.
-      ImmOffset = MI->getOperand(2).getImm();
-      if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <=
-          AMDGPUSubtarget::SEA_ISLANDS)
-        ImmOffset <<= 2;
-      RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-
-      if (isUInt<12>(ImmOffset)) {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-                RegOffset)
-          .addImm(0);
-      } else {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-                RegOffset)
-          .addImm(ImmOffset);
+      unsigned RegOffset;
+      unsigned ImmOffset;
+
+      if (MI->getOperand(2).isReg()) {
+        RegOffset = MI->getOperand(2).getReg();
         ImmOffset = 0;
+      } else {
+        assert(MI->getOperand(2).isImm());
+        // SMRD instructions take a dword offset on SI and a byte offset on VI,
+        // and MUBUF instructions always take a byte offset.
+        ImmOffset = MI->getOperand(2).getImm();
+        if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <=
+            AMDGPUSubtarget::SEA_ISLANDS)
+          ImmOffset <<= 2;
+        RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+        if (isUInt<12>(ImmOffset)) {
+          BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+                  RegOffset)
+            .addImm(0);
+        } else {
+          BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+                  RegOffset)
+            .addImm(ImmOffset);
+          ImmOffset = 0;
+        }
       }
-    }
 
-    unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
-    unsigned DWord0 = RegOffset;
-    unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
+      unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+      unsigned DWord0 = RegOffset;
+      unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
+
+      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), DWord1)
+        .addImm(0);
+      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), DWord2)
+        .addImm(RsrcDataFormat & 0xFFFFFFFF);
+      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), DWord3)
+        .addImm(RsrcDataFormat >> 32);
+      BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), SRsrc)
+        .addReg(DWord0)
+        .addImm(AMDGPU::sub0)
+        .addReg(DWord1)
+        .addImm(AMDGPU::sub1)
+        .addReg(DWord2)
+        .addImm(AMDGPU::sub2)
+        .addReg(DWord3)
+        .addImm(AMDGPU::sub3);
+
+      const MCInstrDesc &NewInstDesc = get(NewOpcode);
+      const TargetRegisterClass *NewDstRC
+        = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
+      NewDstReg = MRI.createVirtualRegister(NewDstRC);
+      unsigned DstReg = MI->getOperand(0).getReg();
+      MRI.replaceRegWith(DstReg, NewDstReg);
+
+      NewInst = BuildMI(*MBB, MI, DL, NewInstDesc, NewDstReg)
+        .addOperand(MI->getOperand(1)) // sbase
+        .addReg(SRsrc)
+        .addImm(0)
+        .addImm(ImmOffset)
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0) // tfe
+        .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+    }
 
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
-      .addImm(0);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
-      .addImm(RsrcDataFormat & 0xFFFFFFFF);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
-      .addImm(RsrcDataFormat >> 32);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
-      .addReg(DWord0)
-      .addImm(AMDGPU::sub0)
-      .addReg(DWord1)
-      .addImm(AMDGPU::sub1)
-      .addReg(DWord2)
-      .addImm(AMDGPU::sub2)
-      .addReg(DWord3)
-      .addImm(AMDGPU::sub3);
-
-    const MCInstrDesc &NewInstDesc = get(NewOpcode);
-    const TargetRegisterClass *NewDstRC
-      = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
-    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
-    unsigned DstReg = MI->getOperand(0).getReg();
-    MRI.replaceRegWith(DstReg, NewDstReg);
-
-    MachineInstr *NewInst =
-      BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
-        .addOperand(MI->getOperand(1)) // sbase
-        .addReg(SRsrc)
-        .addImm(0)
-        .addImm(ImmOffset)
-        .addImm(0) // glc
-        .addImm(0) // slc
-        .addImm(0) // tfe
-        .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
     MI->eraseFromParent();
-    legalizeOperands(NewInst);
     addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
     break;
Index: test/CodeGen/AMDGPU/salu-to-valu.ll
===================================================================
--- test/CodeGen/AMDGPU/salu-to-valu.ll
+++ test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -53,8 +53,8 @@
 
 ; Test moving an SMRD instruction to the VALU
 ; GCN-LABEL: {{^}}smrd_valu:
-; FIXME: We should be using flat load for HSA.
-; GCN: buffer_load_dword [[OUT:v[0-9]+]]
+; GCN-NOHSA: buffer_load_dword [[OUT:v[0-9]+]]
+; GCN-HSA: flat_load_dword [[OUT:v[0-9]+]]
 ; GCN-NOHSA: buffer_store_dword [[OUT]]
 ; GCN-HSA: flat_store_dword [[OUT]]
 define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {