Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -406,19 +406,12 @@
   unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
                               MachineRegisterInfo &MRI) const;
 
+  void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+
   /// \brief Legalize all operands in this instruction. This function may
   /// create new instruction and insert them before \p MI.
   void legalizeOperands(MachineInstr *MI) const;
 
-  /// \brief Split an SMRD instruction into two smaller loads of half the
-  //  size storing the results in \p Lo and \p Hi.
-  void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC,
-                 unsigned HalfImmOp, unsigned HalfSGPROp,
-                 MachineInstr *&Lo, MachineInstr *&Hi) const;
-
-  void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI,
-                      SmallVectorImpl<MachineInstr *> &Worklist) const;
-
   /// \brief Replace this instruction's opcode with the equivalent VALU
   /// opcode. This function will also move the users of \p MI to the
   /// VALU if necessary.
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1621,18 +1621,6 @@
   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
-  case AMDGPU::S_LOAD_DWORD_IMM:
-  case AMDGPU::S_LOAD_DWORD_SGPR:
-  case AMDGPU::S_LOAD_DWORD_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
-  case AMDGPU::S_LOAD_DWORDX2_IMM:
-  case AMDGPU::S_LOAD_DWORDX2_SGPR:
-  case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
-  case AMDGPU::S_LOAD_DWORDX4_IMM:
-  case AMDGPU::S_LOAD_DWORDX4_SGPR:
-  case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
@@ -1993,6 +1981,20 @@
   return DstReg;
 }
 
+void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
+                                       MachineInstr *MI) const {
+
+  // If the pointer is stored in VGPRs, then we need to move it to
+  // SGPRs using v_readfirstlane. This is safe because we only select
+  // loads with uniform pointers to SMRD instructions, so we know the
+  // pointer value is uniform.
+  MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
+  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
+  }
+}
+
 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
 
@@ -2008,6 +2010,12 @@
     return;
   }
 
+  // Legalize SMRD
+  if (isSMRD(*MI)) {
+    legalizeOperandsSMRD(MRI, MI);
+    return;
+  }
+
   // Legalize REG_SEQUENCE and PHI
   // The register class of the operands much be the same type as the register
   // class of the output.
@@ -2280,219 +2288,6 @@
   }
 }
 
-void SIInstrInfo::splitSMRD(MachineInstr *MI,
-                            const TargetRegisterClass *HalfRC,
-                            unsigned HalfImmOp, unsigned HalfSGPROp,
-                            MachineInstr *&Lo, MachineInstr *&Hi) const {
-
-  DebugLoc DL = MI->getDebugLoc();
-  MachineBasicBlock *MBB = MI->getParent();
-  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-  unsigned RegLo = MRI.createVirtualRegister(HalfRC);
-  unsigned RegHi = MRI.createVirtualRegister(HalfRC);
-  unsigned HalfSize = HalfRC->getSize();
-  const MachineOperand *OffOp =
-      getNamedOperand(*MI, AMDGPU::OpName::offset);
-  const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
-
-  // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
-  // on VI.
-
-  bool IsKill = SBase->isKill();
-  if (OffOp) {
-    bool isVI =
-        MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
-        AMDGPUSubtarget::VOLCANIC_ISLANDS;
-    unsigned OffScale = isVI ? 1 : 4;
-    // Handle the _IMM variant
-    unsigned LoOffset = OffOp->getImm() * OffScale;
-    unsigned HiOffset = LoOffset + HalfSize;
-    Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
-             // Use addReg instead of addOperand
-             // to make sure kill flag is cleared.
-             .addReg(SBase->getReg(), 0, SBase->getSubReg())
-             .addImm(LoOffset / OffScale);
-
-    if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
-      unsigned OffsetSGPR =
-          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
-          .addImm(HiOffset); // The offset in register is in bytes.
-      Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
-               .addReg(SBase->getReg(), getKillRegState(IsKill),
-                       SBase->getSubReg())
-               .addReg(OffsetSGPR);
-    } else {
-      Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
-               .addReg(SBase->getReg(), getKillRegState(IsKill),
-                       SBase->getSubReg())
-               .addImm(HiOffset / OffScale);
-    }
-  } else {
-    // Handle the _SGPR variant
-    MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
-    Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
-             .addReg(SBase->getReg(), 0, SBase->getSubReg())
-             .addOperand(*SOff);
-    unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-    BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
-        .addReg(SOff->getReg(), 0, SOff->getSubReg())
-        .addImm(HalfSize);
-    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
-             .addReg(SBase->getReg(), getKillRegState(IsKill),
-                     SBase->getSubReg())
-             .addReg(OffsetSGPR);
-  }
-
-  unsigned SubLo, SubHi;
-  const TargetRegisterClass *NewDstRC;
-  switch (HalfSize) {
-  case 4:
-    SubLo = AMDGPU::sub0;
-    SubHi = AMDGPU::sub1;
-    NewDstRC = &AMDGPU::VReg_64RegClass;
-    break;
-  case 8:
-    SubLo = AMDGPU::sub0_sub1;
-    SubHi = AMDGPU::sub2_sub3;
-    NewDstRC = &AMDGPU::VReg_128RegClass;
-    break;
-  case 16:
-    SubLo = AMDGPU::sub0_sub1_sub2_sub3;
-    SubHi = AMDGPU::sub4_sub5_sub6_sub7;
-    NewDstRC = &AMDGPU::VReg_256RegClass;
-    break;
-  case 32:
-    SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
-    SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
-    NewDstRC = &AMDGPU::VReg_512RegClass;
-    break;
-  default:
-    llvm_unreachable("Unhandled HalfSize");
-  }
-
-  unsigned OldDst = MI->getOperand(0).getReg();
-  unsigned NewDst = MRI.createVirtualRegister(NewDstRC);
-
-  MRI.replaceRegWith(OldDst, NewDst);
-
-  BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst)
-      .addReg(RegLo)
-      .addImm(SubLo)
-      .addReg(RegHi)
-      .addImm(SubHi);
-}
-
-void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
-                                 MachineRegisterInfo &MRI,
-                                 SmallVectorImpl<MachineInstr *> &Worklist) const {
-  MachineBasicBlock *MBB = MI->getParent();
-  int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
-  assert(DstIdx != -1);
-  unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass;
-  switch(RI.getRegClass(DstRCID)->getSize()) {
-  case 4:
-  case 8:
-  case 16: {
-    unsigned NewOpcode = getVALUOp(*MI);
-    unsigned RegOffset;
-    unsigned ImmOffset;
-
-    if (MI->getOperand(2).isReg()) {
-      RegOffset = MI->getOperand(2).getReg();
-      ImmOffset = 0;
-    } else {
-      assert(MI->getOperand(2).isImm());
-      // SMRD instructions take a dword offsets on SI and byte offset on VI
-      // and MUBUF instructions always take a byte offset.
-      ImmOffset = MI->getOperand(2).getImm();
-      if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <=
-          AMDGPUSubtarget::SEA_ISLANDS)
-        ImmOffset <<= 2;
-      RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-
-      if (isUInt<12>(ImmOffset)) {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-                RegOffset)
-            .addImm(0);
-      } else {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-                RegOffset)
-            .addImm(ImmOffset);
-        ImmOffset = 0;
-      }
-    }
-
-    unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
-    unsigned DWord0 = RegOffset;
-    unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
-
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
-        .addImm(0);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
-        .addImm(RsrcDataFormat & 0xFFFFFFFF);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
-        .addImm(RsrcDataFormat >> 32);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
-        .addReg(DWord0)
-        .addImm(AMDGPU::sub0)
-        .addReg(DWord1)
-        .addImm(AMDGPU::sub1)
-        .addReg(DWord2)
-        .addImm(AMDGPU::sub2)
-        .addReg(DWord3)
-        .addImm(AMDGPU::sub3);
-
-    const MCInstrDesc &NewInstDesc = get(NewOpcode);
-    const TargetRegisterClass *NewDstRC
-        = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
-    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
-    unsigned DstReg = MI->getOperand(0).getReg();
-    MRI.replaceRegWith(DstReg, NewDstReg);
-
-    MachineInstr *NewInst =
-        BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
-            .addOperand(MI->getOperand(1)) // sbase
-            .addReg(SRsrc)
-            .addImm(0)
-            .addImm(ImmOffset)
-            .addImm(0) // glc
-            .addImm(0) // slc
-            .addImm(0) // tfe
-            .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
-    MI->eraseFromParent();
-
-    legalizeOperands(NewInst);
-    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
-    break;
-  }
-  case 32: {
-    MachineInstr *Lo, *Hi;
-    addUsersToMoveToVALUWorklist(MI->getOperand(0).getReg(), MRI, Worklist);
-    splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
-              AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
-    MI->eraseFromParent();
-    moveSMRDToVALU(Lo, MRI, Worklist);
-    moveSMRDToVALU(Hi, MRI, Worklist);
-    break;
-  }
-
-  case 64: {
-    MachineInstr *Lo, *Hi;
-    addUsersToMoveToVALUWorklist(MI->getOperand(0).getReg(), MRI, Worklist);
-    splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
-              AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
-    MI->eraseFromParent();
-    moveSMRDToVALU(Lo, MRI, Worklist);
-    moveSMRDToVALU(Hi, MRI, Worklist);
-    break;
-  }
-  }
-}
-
 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
   SmallVector<MachineInstr *, 128> Worklist;
   Worklist.push_back(&TopInst);
@@ -2508,10 +2303,6 @@
     // Handle some special cases
     switch (Opcode) {
     default:
-      if (isSMRD(*Inst)) {
-        moveSMRDToVALU(Inst, MRI, Worklist);
-        continue;
-      }
       break;
     case AMDGPU::S_AND_B64:
       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
Index: llvm/trunk/test/CodeGen/AMDGPU/missing-store.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/missing-store.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/missing-store.ll
@@ -8,7 +8,9 @@
 ; FUNC-LABEL: {{^}}missing_store_reduced:
 ; SI: ds_read_b64
 ; SI: buffer_store_dword
-; SI: buffer_load_dword
+; SI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
Index: llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -53,10 +53,14 @@
 
 ; Test moving an SMRD instruction to the VALU
 ; GCN-LABEL: {{^}}smrd_valu:
-; FIXME: We should be using flat load for HSA.
-; GCN: buffer_load_dword [[OUT:v[0-9]+]]
-; GCN-NOHSA: buffer_store_dword [[OUT]]
-; GCN-HSA: flat_store_dword {{.*}}, [[OUT]]
+; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
+; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
+; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
+; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
+; GCN-NOHSA: buffer_store_dword [[V_OUT]]
+; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
 define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
 entry:
   %tmp = icmp ne i32 %a, 0
Index: llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -70,15 +70,14 @@
   ret void
 }
 
-; Technically we could reorder these, but just comparing the
-; instruction type of the load is insufficient.
-
-; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load
-; CI: buffer_load_dword
+; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
 ; CI: buffer_store_dword
-; CI: buffer_load_dword
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
 ; CI: buffer_store_dword
-define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
@@ -95,8 +94,10 @@
 }
 
 ; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_load_dword
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
 ; CI: ds_write_b32
 ; CI: buffer_store_dword
 define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
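
Note (not part of the patch): legalizeOperandsSMRD leans on the existing readlaneVGPRToSGPR helper to copy a VGPR base pointer into SGPRs. The sketch below is only an illustration of the general shape of such a readfirstlane-based copy for the common 64-bit sbase case; the function name copyVGPRPtrToSGPR and the 64-bit-only handling are assumptions for clarity, and this is not the in-tree implementation.

// Illustrative sketch (assumption), not SIInstrInfo::readlaneVGPRToSGPR itself.
// Copies a 64-bit VGPR base pointer into an SGPR pair before UseMI.
static unsigned copyVGPRPtrToSGPR(unsigned SrcReg, MachineInstr *UseMI,
                                  MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII) {
  MachineBasicBlock *MBB = UseMI->getParent();
  DebugLoc DL = UseMI->getDebugLoc();

  // v_readfirstlane_b32 moves 32 bits at a time, so copy each half of the
  // pointer separately and reassemble the SGPR pair with REG_SEQUENCE.
  unsigned Lo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned Hi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned Dst = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  BuildMI(*MBB, UseMI, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), Lo)
      .addReg(SrcReg, 0, AMDGPU::sub0);
  BuildMI(*MBB, UseMI, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), Hi)
      .addReg(SrcReg, 0, AMDGPU::sub1);
  BuildMI(*MBB, UseMI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(Lo)
      .addImm(AMDGPU::sub0)
      .addReg(Hi)
      .addImm(AMDGPU::sub1);

  return Dst;
}

This matches the FileCheck updates above, which expect two v_readfirstlane_b32 instructions followed by an s_load_dword from the resulting SGPR pair.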