diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -194,7 +194,6 @@ SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; - SDNode *glueCopyToM0LDSInit(SDNode *N) const; const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); @@ -617,20 +616,6 @@ return glueCopyToOp(N, M0, M0.getValue(1)); } -SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const { - unsigned AS = cast<MemSDNode>(N)->getAddressSpace(); - if (AS == AMDGPUAS::LOCAL_ADDRESS) { - if (Subtarget->ldsRequiresM0Init()) - return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); - } else if (AS == AMDGPUAS::REGION_ADDRESS) { - MachineFunction &MF = CurDAG->getMachineFunction(); - unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize(); - return - glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32)); - } - return N; -} - MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, EVT VT) const { SDNode *Lo = CurDAG->getMachineNode( @@ -709,18 +694,6 @@ return; // Already selected. } - // isa<MemSDNode> almost works but is slightly too permissive for some DS - // intrinsics. 
- if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) || - (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || - Opc == ISD::ATOMIC_LOAD_FADD || - Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) { - N = glueCopyToM0LDSInit(N); - SelectCode(N); - return; - } - switch (Opc) { default: break; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -50,6 +50,9 @@ bits<1> has_m0_read = 1; let Uses = !if(has_m0_read, [M0, EXEC], [EXEC]); + + // Initialize M0 in SITargetLowering::AdjustInstrPostInstrSelection + let hasPostISelHook = has_m0_read; } class DS_Real <DS_Pseudo ds> : @@ -326,6 +329,9 @@ let has_gds = 0; let gdsValue = 1; let AsmMatchConverter = "cvtDSGds"; + + // Disable M0 default initialization + let hasPostISelHook = 0; } class DS_GWS_0D <string opName> @@ -565,9 +571,11 @@ } // End mayStore = 0 +let hasPostISelHook = 0 in { // Disable default M0 initialization def DS_CONSUME : DS_0A_RET<"ds_consume">; def DS_APPEND : DS_0A_RET<"ds_append">; def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; +} //===----------------------------------------------------------------------===// // Instruction definitions for CI and newer. 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -221,6 +221,8 @@ SDLoc DL, SDValue Ops[], MemSDNode *M) const; + void initDefaultM0(MachineInstr &MI) const; + public: SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10787,6 +10787,61 @@ return Node; } +// These instructions specify M0 value during selection, +// see AMDGPUDAGToDAGISel::glueCopyToM0, SITargetLowering::copyToM0 usage +static bool hasNonDefaultM0(unsigned Opcode) { + switch(Opcode) { + case AMDGPU::DS_APPEND: + case AMDGPU::DS_CONSUME: + case AMDGPU::DS_GWS_INIT: + case AMDGPU::DS_GWS_BARRIER: + case AMDGPU::DS_GWS_SEMA_V: + case AMDGPU::DS_GWS_SEMA_BR: + case AMDGPU::DS_GWS_SEMA_P: + case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: + case AMDGPU::DS_ORDERED_COUNT: + return true; + default: break; + } + return false; +} + +// Faster version of readsRegister: M0 is usually the first implicit operand +static bool readsM0(MachineInstr &MI) { + for (unsigned I = MI.getNumExplicitOperands(), E = MI.getNumOperands(); + I != E; ++I) { + auto &MO = MI.getOperand(I); + if (MO.isReg() && MO.isUse() && MO.getReg() == AMDGPU::M0) + return true; + } + return false; +} + +void SITargetLowering::initDefaultM0(MachineInstr &MI) const { + auto &TII = *Subtarget->getInstrInfo(); + assert(TII.isDS(MI) && MI.getNumMemOperands()); + + auto AS = (*MI.memoperands_begin())->getAddrSpace(); + unsigned M0Init; + if (AS == AMDGPUAS::REGION_ADDRESS && + TII.getNamedOperand(MI, AMDGPU::OpName::gds)->getImm() != 0) { + auto &MF = *MI.getParent()->getParent(); + M0Init = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize(); + } else if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->ldsRequiresM0Init()) + M0Init = -1; + else 
+ return; + + // hasPostISelHook should be disabled for the special cases + assert(!hasNonDefaultM0(MI.getOpcode())); + + if (!readsM0(MI)) + return; + + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0).addImm(M0Init); +} + /// Assign the register class depending on the number of /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, @@ -10795,6 +10850,9 @@ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + if (TII->isDS(MI) && MI.getNumMemOperands()) + initDefaultM0(MI); + if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -634,12 +634,10 @@ } ; GCN-LABEL: {{^}}ds_read_interp_read: -; CI: s_mov_b32 m0, -1 -; CI: ds_read_b32 ; CI: s_mov_b32 m0, s0 ; CI: v_interp_mov_f32 ; CI: s_mov_b32 m0, -1 -; CI: ds_read_b32 +; CI: ds_read2_b32 v[0:1], v0 offset1:4 ; GFX9: ds_read2_b32 v[0:1], v0 offset1:4 ; GFX9: s_mov_b32 m0, s0 ; GFX9: v_interp_mov_f32 diff --git a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll @@ -4,8 +4,8 @@ ; an unused stack slot, causing ScratchSize to be non-zero. ; GCN-LABEL: store_v3i32: -; GCN: ds_read_b32 ; GCN: ds_read_b64 +; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: ds_write_b64 ; GCN: ScratchSize: 0 @@ -17,8 +17,8 @@ } ; GCN-LABEL: store_v5i32: -; GCN: ds_read_b32 ; GCN: ds_read_b128 +; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: ds_write_b128 ; GCN: ScratchSize: 0