Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -90,8 +90,6 @@
   bool isConstantLoad(const MemSDNode *N, int cbID) const;
   bool isUniformBr(const SDNode *N) const;
 
-  SDNode *glueCopyToM0(SDNode *N) const;
-
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
   bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
@@ -263,31 +261,6 @@
   }
 }
 
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
-  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
-      cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
-    return N;
-
-  const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering*>(getTargetLowering());
-
-  // Write max value to m0 before each load operation
-
-  SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
-                                 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
-
-  SDValue Glue = M0.getValue(1);
-
-  SmallVector<SDValue, 8> Ops;
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    Ops.push_back(N->getOperand(i));
-  }
-  Ops.push_back(Glue);
-  CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
-
-  return N;
-}
-
 static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
   switch (NumVectorElts) {
   case 1:
@@ -312,10 +285,6 @@
     return;   // Already selected.
   }
 
-  if (isa<MemSDNode>(N) ||
-      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
-    N = glueCopyToM0(N);
-
   switch (Opc) {
   default: break;
   // We are selecting i64 ADD here instead of custom lower it during
@@ -479,12 +448,6 @@
                                                   N->getValueType(0), Ops));
     return;
   }
-  case ISD::LOAD:
-  case ISD::STORE: {
-    N = glueCopyToM0(N);
-    break;
-  }
-
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
     if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -313,6 +313,12 @@
   CVT_F32_UBYTE1,
   CVT_F32_UBYTE2,
   CVT_F32_UBYTE3,
+
+  // These are the same as the standard nodes except they have glue input/output
+  // for the implicit m0 use.
+  INSERT_VECTOR_ELT_INDIRECT,
+  EXTRACT_VECTOR_ELT_INDIRECT,
+
   /// This node is for VLIW targets and it is used to represent a vector
   /// that is stored in consecutive registers with the same channel.
/// For example: Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3438,6 +3438,8 @@ NODE_NAME_CASE(CVT_F32_UBYTE1) NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(INSERT_VECTOR_ELT_INDIRECT) + NODE_NAME_CASE(EXTRACT_VECTOR_ELT_INDIRECT) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -44,6 +44,14 @@ //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes + +def extractelt_indirect : SDNode<"AMDGPUISD::EXTRACT_VECTOR_ELT_INDIRECT", SDTVecExtract, + [SDNPOutGlue] +>; +def insertelt_indirect : SDNode<"AMDGPUISD::INSERT_VECTOR_ELT_INDIRECT", SDTVecInsert, + [SDNPOutGlue] +>; + // def AMDGPUconstdata_ptr : SDNode< @@ -257,15 +265,15 @@ def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT", SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue]>; + [SDNPInGlue, SDNPOutGlue]>; def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", SDTypeProfile<1, 3, [SDTCisFP<0>]>, @@ -273,7 +281,7 @@ def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", SDTypeProfile<1, 4, [SDTCisFP<0>]>, - [SDNPInGlue]>; + [SDNPInGlue, SDNPOutGlue]>; def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT, Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -76,8 +76,8 @@ MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); - unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); if (Size != 64) return false; @@ -153,8 +153,8 @@ assert(Size == 64); DebugLoc DL = I.getDebugLoc(); - unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); const APInt &Imm = I.getOperand(1).getCImm()->getValue(); BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg) @@ -337,7 +337,7 @@ if (isUInt<32>(GEPInfo.Imm)) { Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize); - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(GEPInfo.Imm); Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp 
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -68,6 +68,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -327,19 +328,50 @@
   return true;
 }
 
+static bool usesDefaultM0Value(const SIInstrInfo &TII, const MachineInstr &MI) {
+  return TII.isDS(MI) || TII.mayAccessFlatAddressSpace(MI);
+}
+
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
   MDT = &getAnalysis<MachineDominatorTree>();
 
-  SmallVector<MachineInstr *, 16> Worklist;
+  MachineBasicBlock &Entry = *MF.begin();
+  for (const MachineInstr &MI : MRI.use_instructions(AMDGPU::M0)) {
+    if (usesDefaultM0Value(*TII, MI)) {
+      MFI->setNeedsM0Intialization();
+      break;
+    }
+  }
+
+  bool NeedM0 = false;
+  if (MFI->needsM0Initialization()) {
+    NeedM0 = true;
+    TII->emitSetM0ToDefaultValue(Entry, Entry.begin(), DebugLoc());
+  } else {
+    NeedM0 = MRI.isPhysRegUsed(AMDGPU::M0);
+    if (NeedM0) {
+      BuildMI(Entry, Entry.begin(), DebugLoc(),
+              TII->get(AMDGPU::IMPLICIT_DEF), AMDGPU::M0);
+    }
+  }
+
+  SmallVector<MachineInstr *, 16> Worklist;
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                   BI != BE; ++BI) {
     MachineBasicBlock &MBB = *BI;
+    if (NeedM0 && (&MBB != &Entry)) {
+      assert(!MBB.isLiveIn(AMDGPU::M0));
+      MBB.addLiveIn(AMDGPU::M0);
+    }
+
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E; ++I) {
       MachineInstr &MI = *I;
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -197,8 +197,23 @@
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const override;
   ConstraintType getConstraintType(StringRef Constraint) const override;
-  SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
-                   SDValue V) const;
+
+  SDValue getNodeWithM0Use(SelectionDAG &DAG, unsigned Opc,
+                           const SDLoc &SL, EVT VT,
+                           SDValue Chain, ArrayRef<SDValue> Ops,
+                           SDValue M0Val) const;
+
+  SDValue getNodeWithM0SaveRestore(SelectionDAG &DAG, unsigned Opc,
+                                   const SDLoc &SL, EVT VT,
+                                   ArrayRef<SDValue> Ops) const;
+
+  SDValue getDefaultM0Value(SelectionDAG &DAG, const SDLoc &SL) const;
+
+  SDValue getNodeWithM0UseRestoreDefault(SelectionDAG &DAG, unsigned Opc,
+                                         const SDLoc &SL, EVT VT,
+                                         SDValue Chain, ArrayRef<SDValue> Ops,
+                                         SDValue M0Val = SDValue()) const;
+
 };
 
 } // End namespace llvm
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -208,8 +208,8 @@
       case ISD::STORE:
       case ISD::BUILD_VECTOR:
       case ISD::BITCAST:
-      case ISD::EXTRACT_VECTOR_ELT:
-      case ISD::INSERT_VECTOR_ELT:
+      //case ISD::EXTRACT_VECTOR_ELT:
+      //case ISD::INSERT_VECTOR_ELT:
       case ISD::INSERT_SUBVECTOR:
       case ISD::EXTRACT_SUBVECTOR:
       case ISD::SCALAR_TO_VECTOR:
@@ -224,6 +224,28 @@
     }
   }
 
+#if 1
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
+
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
+
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16f32, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i32, Custom);
+#endif
+
   // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
   // is expanded to avoid having two separate loops in case the index is a VGPR.
 
@@ -1565,6 +1587,10 @@
   bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
 
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
+  TII->emitSetM0ToDefaultValue(MBB, std::next(I), DL);
+
   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
     MachineBasicBlock::iterator I(&MI);
     const DebugLoc &DL = MI.getDebugLoc();
@@ -1589,9 +1615,6 @@
     return &MBB;
   }
 
-  const DebugLoc &DL = MI.getDebugLoc();
-  MachineBasicBlock::iterator I(&MI);
-
   unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -1682,6 +1705,11 @@
     return &MBB;
   }
 
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
+
+  TII->emitSetM0ToDefaultValue(MBB, std::next(I), DL);
+
   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
     MachineBasicBlock::iterator I(&MI);
     const DebugLoc &DL = MI.getDebugLoc();
@@ -1712,8 +1740,6 @@
   if (Val->isReg())
     MRI.clearKillFlags(Val->getReg());
 
-  const DebugLoc &DL = MI.getDebugLoc();
-
   if (UseGPRIdxMode) {
     MachineBasicBlock::iterator I(&MI);
@@ -2005,13 +2031,17 @@
                                           SelectionDAG &DAG) const {
   switch (N->getOpcode()) {
   case ISD::INSERT_VECTOR_ELT: {
-    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
-      Results.push_back(Res);
+    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG)) {
+      if (Res.getNode() != N)
+        Results.push_back(Res);
+    }
     return;
   }
   case ISD::EXTRACT_VECTOR_ELT: {
-    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
-      Results.push_back(Res);
+    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG)) {
+      if (Res.getNode() != N)
+        Results.push_back(Res);
+    }
     return;
   }
   default:
@@ -2236,9 +2266,10 @@
                                              SelectionDAG &DAG) const {
   if (Subtarget->hasApertureRegs()) {
     // Read from Aperture Registers directly.
-    unsigned RegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) ? AMDGPU::SRC_SHARED_BASE :
-      AMDGPU::SRC_PRIVATE_BASE;
-    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, RegNo, MVT::i32);
+    unsigned Reg = (AS == AMDGPUAS::LOCAL_ADDRESS) ? AMDGPU::SRC_SHARED_BASE :
+      AMDGPU::SRC_PRIVATE_BASE;
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
+                                Reg, MVT::i32);
   }
 
   SDLoc SL;
@@ -2324,11 +2355,19 @@
                                                   SelectionDAG &DAG) const {
   SDValue Idx = Op.getOperand(2);
   if (isa<ConstantSDNode>(Idx))
-    return SDValue();
+    return Op;
 
-  // Avoid stack access for dynamic indexing.
   SDLoc SL(Op);
+  EVT ResultVT = Op.getValueType();
   SDValue Vec = Op.getOperand(0);
+
+  if (Vec.getValueType().getSizeInBits() != 32) {
+    SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1), Op.getOperand(2) };
+    return getNodeWithM0UseRestoreDefault(DAG, AMDGPUISD::INSERT_VECTOR_ELT_INDIRECT,
+                                          SL, ResultVT, SDValue(), Ops);
+  }
+
+  // Avoid stack access for dynamic indexing.
   SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
 
   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
@@ -2361,8 +2400,11 @@
   SDValue Idx = Op.getOperand(1);
 
   if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
-    SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+    if (Vec.getValueType().getSizeInBits() != 32) {
+      return Op;
+    }
 
+    SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
     if (CIdx->getZExtValue() == 1) {
       Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
                            DAG.getConstant(16, SL, MVT::i32));
@@ -2375,6 +2417,12 @@
     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
   }
 
+  if (Vec.getValueType().getSizeInBits() != 32) {
+    SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+    return getNodeWithM0UseRestoreDefault(DAG, AMDGPUISD::EXTRACT_VECTOR_ELT_INDIRECT,
+                                          SL, ResultVT, SDValue(), Ops);
+  }
+
   SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
 
   // Convert vector index to bit-index.
@@ -2474,8 +2522,18 @@
                                  MachineMemOperand::MOInvariant);
 }
 
-SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
-                                   const SDLoc &DL, SDValue V) const {
+SDValue SITargetLowering::getNodeWithM0Use(SelectionDAG &DAG, unsigned Opc,
+                                           const SDLoc &SL, EVT VT,
+                                           SDValue InputChain,
+                                           ArrayRef<SDValue> Ops,
+                                           SDValue M0Val) const {
+  SDValue Chain = InputChain ? InputChain : DAG.getEntryNode();
+  SDValue OrigM0 = DAG.getCopyFromReg(Chain, SL, AMDGPU::M0, MVT::i32, SDValue());
+
+#if 1
+  SDValue NewM0 = DAG.getCopyToReg(OrigM0.getValue(1), SL, AMDGPU::M0,
+                                   M0Val, OrigM0.getValue(2));
+#else
   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
   // the destination register.
   //
@@ -2485,9 +2543,132 @@
   // We don't use CopyToReg, because MachineCSE won't combine COPY
   // instructions, so we will end up with redundant moves to m0.
   //
   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
   // A Null SDValue creates a glue result.
-  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
-                                  V, Chain);
-  return SDValue(M0, 0);
+  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, SL, MVT::Other, MVT::Glue,
+                                  M0Val, Chain, OrigM0.getValue(2));
+  SDValue NewM0(M0, 0);
+
+#endif
+
+  SDVTList VTList = (InputChain && VT != MVT::Other) ?
+    DAG.getVTList(VT, MVT::Other, MVT::Glue) : DAG.getVTList(VT, MVT::Glue);
+
+  SmallVector<SDValue, 8> NodeOps;
+
+  if (InputChain)
+    NodeOps.push_back(NewM0); // Chain
+
+  NodeOps.append(Ops.begin(), Ops.end()); // Operands
+  NodeOps.push_back(NewM0.getValue(1)); // Glue
+
+  SDValue Node = DAG.getNode(Opc, SL, VTList, NodeOps);
+
+  SDValue OutChain = InputChain ? Node : NewM0;
+  SDValue OutGlue = Node.getValue(Node->getNumValues() - 1);
+#if 1
+  SDValue RestoreM0 = DAG.getCopyToReg(OutChain, SL, AMDGPU::M0,
+                                       OrigM0, OutGlue);
+#else
+  SDNode *InitM0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, SL, MVT::Other, MVT::Glue,
+                                      OrigM0, OutChain, OutGlue);
+  SDValue RestoreM0(InitM0, 0);
+#endif
+  if (InputChain)
+    return RestoreM0;
+
+  SDValue NewRoot = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+                                RestoreM0, DAG.getRoot());
+  DAG.setRoot(NewRoot);
+  return Node;
+}
+
+SDValue SITargetLowering::getNodeWithM0SaveRestore(SelectionDAG &DAG, unsigned Opc,
+                                                   const SDLoc &SL, EVT VT,
+                                                   ArrayRef<SDValue> Ops) const {
+  SDValue SaveM0 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::M0, MVT::i32,
+                                      SDValue());
+
+  SmallVector<SDValue, 8> OpsWithGlue(Ops.begin(), Ops.end());
+  OpsWithGlue.push_back(SaveM0.getValue(2));
+
+  SDVTList VTs = DAG.getVTList(VT, MVT::Glue);
+  SDValue Op = DAG.getNode(Opc, SL, VTs, OpsWithGlue);
+
+#if 1
+  SDValue RestoreM0 = DAG.getCopyToReg(SaveM0.getValue(1), SL, AMDGPU::M0,
+                                       SaveM0.getValue(0), Op.getValue(1));
+#else
+  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, SL, MVT::Other, MVT::Glue,
+                                  SaveM0, SaveM0.getValue(1), Op.getValue(1));
+  SDValue RestoreM0(M0, 0);
+
+#endif
+  SDValue NewRoot = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+                                RestoreM0, DAG.getRoot());
+  DAG.setRoot(NewRoot);
+  return Op;
+}
+
+SDValue SITargetLowering::getDefaultM0Value(SelectionDAG &DAG,
+                                            const SDLoc &SL) const {
+  return DAG.getConstant(-1, SL, MVT::i32);
+  //return DAG.getTargetConstant(-1, SL, MVT::i32);
+}
+
+SDValue SITargetLowering::getNodeWithM0UseRestoreDefault(SelectionDAG &DAG,
+                                                         unsigned Opc,
+                                                         const SDLoc &SL, EVT VT,
+                                                         SDValue InputChain,
+                                                         ArrayRef<SDValue> Ops,
+                                                         SDValue M0Val) const {
+  SDValue Chain = InputChain ? InputChain : DAG.getEntryNode();
+  SDVTList VTList = InputChain ?
+    DAG.getVTList(VT, MVT::Other, MVT::Glue) : DAG.getVTList(VT, MVT::Glue);
+
+
+  //SDValue M0Reg = DAG.getRegister(AMDGPU::M0, MVT::i32);
+  //SDValue NewM0 = copyToM0(DAG, OrigM0.getValue(1), SL, M0Val,
+
+  SmallVector<SDValue, 8> NodeOps;
+
+  SDValue Node;
+  SDValue OutChain = Chain;
+  if (M0Val) {
+    // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
+    // the destination register.
+    //
+    //
+    // We don't use CopyToReg, because MachineCSE won't combine COPY
+    // instructions, so we will end up with redundant moves to m0.
+    //
+    // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
+    //
+    // A Null SDValue creates a glue result.
+    SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, SL, MVT::Other,
+                                    MVT::Glue, M0Val, Chain);
+    SDValue NewM0(M0, 0);
+
+    if (InputChain)
+      NodeOps.push_back(NewM0); // Chain
+    NodeOps.append(Ops.begin(), Ops.end()); // Operands
+    NodeOps.push_back(NewM0.getValue(1)); // Glue
+
+    Node = DAG.getNode(Opc, SL, VTList, NodeOps);
+    OutChain = InputChain ?
Node : NewM0; + } else { + NodeOps.append(Ops.begin(), Ops.end()); // Operands + Node = DAG.getNode(Opc, SL, VTList, NodeOps); + } + + SDValue OutGlue = Node.getValue(Node->getNumValues() - 1); + SDValue RestoreM0 = DAG.getCopyToReg(OutChain, SL, AMDGPU::M0, + getDefaultM0Value(DAG, SL), OutGlue); + if (InputChain) + return RestoreM0; + + SDValue NewRoot = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + RestoreM0, DAG.getRoot()); + DAG.setRoot(NewRoot); + return Node; } SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, @@ -2695,24 +2876,24 @@ if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) return DAG.getUNDEF(MVT::i32); return Op; - case Intrinsic::amdgcn_interp_mov: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Glue); - } - case Intrinsic::amdgcn_interp_p1: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Glue); + case Intrinsic::amdgcn_interp_mov: + case Intrinsic::amdgcn_interp_p1: { + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_interp_mov ? + AMDGPUISD::INTERP_MOV : AMDGPUISD::INTERP_P1; + + SDValue M0 = Op.getOperand(4); + SDValue Ops[] = { Op.getOperand(1), Op.getOperand(2), Op.getOperand(3) }; + return getNodeWithM0UseRestoreDefault(DAG, Opc, DL, + MVT::f32, SDValue(), Ops, M0); } case Intrinsic::amdgcn_interp_p2: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); - SDValue Glue = SDValue(M0.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), - Glue); + SDValue M0 = Op.getOperand(5); + SDValue Ops[] = { + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4) + }; + + return getNodeWithM0UseRestoreDefault(DAG, AMDGPUISD::INTERP_P2, DL, + MVT::f32, SDValue(), Ops, M0); } case Intrinsic::amdgcn_sin: return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); @@ -2873,7 +3054,7 @@ unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { - case Intrinsic::amdgcn_exp: { + case Intrinsic::amdgcn_exp: { const ConstantSDNode *Tgt = cast(Op.getOperand(2)); const ConstantSDNode *En = cast(Op.getOperand(3)); const ConstantSDNode *Done = cast(Op.getOperand(8)); @@ -2924,10 +3105,9 @@ case Intrinsic::amdgcn_s_sendmsghalt: { unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ? 
AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT; - Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); - SDValue Glue = Chain.getValue(1); - return DAG.getNode(NodeOp, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); + SDValue M0 = Op.getOperand(3); + return getNodeWithM0Use(DAG, NodeOp, DL, MVT::Other, + Chain, { Op.getOperand(2) }, M0); } case AMDGPUIntrinsic::SI_tbuffer_store: { SDValue Ops[] = { Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -753,6 +753,10 @@ CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override; bool isBasicBlockPrologue(const MachineInstr &MI) const override; + + void emitSetM0ToDefaultValue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL) const; }; namespace AMDGPU { Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3784,3 +3784,12 @@ return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI); } + +void SIInstrInfo::emitSetM0ToDefaultValue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL) const { + // TODO: Restrict to compile time LDS size or initialize with dynamic size + // from register? + BuildMI(MBB, I, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addImm(-1); +} Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -286,8 +286,8 @@ let DisableWQM = 1; } -let Defs = [M0, EXEC], - UseNamedOperandTable = 1 in { +// These use and define m0, but restore it to its default value. +let Defs = [M0, EXEC], UseNamedOperandTable = 1 in { class SI_INDIRECT_SRC : VPseudoInstSI < (outs VGPR_32:$vdst), @@ -855,13 +855,13 @@ multiclass SI_INDIRECT_Pattern { // Extract with offset def : Pat< - (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), + (eltvt (extractelt_indirect vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), (!cast("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) >; // Insert with offset def : Pat< - (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), + (insertelt_indirect vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), (!cast("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) >; } Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -141,6 +141,7 @@ unsigned NumSystemSGPRs; private: + bool NeedsM0Initialization; bool HasSpilledSGPRs; bool HasSpilledVGPRs; bool HasNonSpillStackObjects; @@ -379,6 +380,20 @@ return PrivateMemoryPtrUserSGPR; } + bool needsM0Initialization() const { + return NeedsM0Initialization; + } + + /// The default m0 value is to support LDS accesses. If this is true, other + /// uses need to restore this value after mutating m0. 
+ bool shouldPreserveDefaultM0Value() const { + return NeedsM0Initialization; + } + + void setNeedsM0Intialization() { + NeedsM0Initialization = true; + } + bool hasSpilledSGPRs() const { return HasSpilledSGPRs; } Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -50,6 +50,7 @@ PSInputEna(0), NumUserSGPRs(0), NumSystemSGPRs(0), + NeedsM0Initialization(false), HasSpilledSGPRs(false), HasSpilledVGPRs(false), HasNonSpillStackObjects(false), Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -279,6 +279,7 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> { let AllocationPriority = 7; + let isAllocatable = 0; } def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> { Index: test/CodeGen/AMDGPU/control-flow-fastregalloc.ll =================================================================== --- test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -42,7 +42,7 @@ ; GCN: mask branch [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: {{^}}BB{{[0-9]+}}_1: ; %if -; GCN: s_mov_b32 m0, -1 +; GCN-NOT: m0 ; GCN: ds_read_b32 [[LOAD1:v[0-9]+]] ; VMEM: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; VGPR: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload Index: test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll @@ -1,19 +1,32 @@ -; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s ; FIXME: Merge into indirect-addressing-si.ll ; Make sure that TwoAddressInstructions keeps src0 as subregister sub0 ; of the tied implicit use and def of the super register. 
-; CHECK-LABEL: {{^}}insert_wo_offset: -; CHECK: s_load_dword [[IN:s[0-9]+]] -; CHECK: s_mov_b32 m0, [[IN]] -; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]] -; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]: -define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { +; GCN-LABEL: {{^}}insert_wo_offset: +; GCN: ; implicit-def: %M0 +; GCN-DAG: s_load_dword [[IN:s[0-9]+]] +; GCN: s_mov_b32 [[NEG1:s[0-9]+]], -1{{$}} + +; MOVREL: s_mov_b32 m0, [[IN]] +; MOVREL-NEXT: v_movreld_b32_e32 v[[ELT0:[0-9]+]] + +; IDXMODE: s_set_gpr_idx_on [[IN]], dst +; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v +; IDXMODE-NEXT: s_set_gpr_idx_off + +; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: s_mov_b32 m0, [[NEG1]] +; GCN-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]: +define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) #0 { entry: %ins = insertelement <4 x float> , float 5.0, i32 %in store <4 x float> %ins, <4 x float> addrspace(1)* %out ret void } +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll =================================================================== --- test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll +++ test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll @@ -3,16 +3,19 @@ ; Make sure that m0 is not reinitialized in the loop. ; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init: +; Initialize in entry block only +; GCN: s_mov_b32 m0, -1 +; GCN-NOT: m0 ; GCN: s_cbranch_scc1 BB0_3 +; GCN-NOT: m0 -; Initialize in preheader -; GCN: s_mov_b32 m0, -1 ; GCN: BB0_2: ; GCN-NOT: m0 ; GCN: ds_read_b32 ; GCN-NOT: m0 ; GCN: buffer_store_dword +; GCN-NOT: m0 ; GCN: s_cbranch_scc0 BB0_2 @@ -46,4 +49,32 @@ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph } +; GCN-LABEL: {{^}}diamond_lds_m0_init: +; GCN: s_mov_b32 m0, -1 +; GCN-NOT: m0 +; GCN: ds_read_b32 +; GCN-NOT: m0 +; GCN: ds_read_b32 +; GCN-NOT: s_mov_b32 m0 +define void @diamond_lds_m0_init(i32 addrspace(1)* noalias nocapture %out0, i32 addrspace(1)* noalias nocapture %out1, i32 addrspace(3)* noalias nocapture readonly %in0, i32 addrspace(3)* noalias nocapture readonly %in1, i32 %n) #0 { +bb: + %tmp = icmp sgt i32 %n, 0 + br i1 %tmp, label %then, label %else + +then: + %val0 = load volatile i32, i32 addrspace(3)* %in0 + store volatile i32 %val0, i32 addrspace(1)* %out0 + call void asm "", ""() + br label %endif + +else: + %val1 = load volatile i32, i32 addrspace(3)* %in1 + store volatile i32 %val1, i32 addrspace(1)* %out1 + call void asm "", ""() + br label %endif + +endif: + ret void +} + attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll @@ -11,15 +11,15 @@ ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}} ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}} ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}} -define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x float>) { +define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg %m0, <2 x float> %arg3) { main_body: - %i = extractelement <2 x float> %4, i32 0 - %j = extractelement <2 x float> %4, i32 1 - %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %3) - %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %3) - %p0_1 = 
call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %3) - %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %3) - %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %3) + %i = extractelement <2 x float> %arg3, i32 0 + %j = extractelement <2 x float> %arg3, i32 1 + %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %m0) + %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %m0) + %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %m0) + %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %m0) + %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0) %w = fadd float %p1_1, %const call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %w) ret void @@ -156,6 +156,7 @@ ret void } +; FIXME: Dead def of m0 emitted ; SI won't merge ds memory operations, because of the signed offset bug, so ; we only have check lines for VI. ; VI-LABEL: v_interp_readnone: @@ -173,6 +174,34 @@ ret void } +; GCN-LABEL: {{^}}v_interp_lds_restore: +; GCN: s_wqm +; GCN-NEXT: s_mov_b32 m0, s6 +; GCN-NEXT: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.x{{$}} +; GCN: s_mov_b32 m0, -1{{$}} +; GCN-NOT: m0 +; GCN: ds_write_b32 +; GCN-NEXT: s_mov_b32 m0, s6 + +; GCN: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}} +; GCN-NEXT: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}} +; GCN-NEXT: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}} +; GCN-NOT: m0 +define amdgpu_ps void @v_interp_lds_restore(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg %m0, <2 x float> %arg3) { +main_body: + %i = extractelement <2 x float> %arg3, i32 0 + %j = extractelement <2 x float> %arg3, i32 1 + %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %m0) + %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %m0) + store volatile float 4.0, float addrspace(3)* undef + %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %m0) + %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %m0) + %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0) + %w = fadd float %p1_1, %const + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %w) + ret void +} + ; Thest that v_interp_p1 uses different source and destination registers ; on 16 bank LDS chips. @@ -209,6 +238,42 @@ ret void } +; Put the interps out of the entry block, obscuring the dead def of m0 +; in the entry. 
+; FIXME: getting dead def +; GCN-LABEL: {{^}}v_interp_no_dead_setup: +; GCN: ; implicit-def: %M0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_cbranch_scc1 + +; GCN: s_mov_b32 m0, s6 +; GCN: v_interp_p1_f32 + +; FIXME: Dead +; GCN: s_mov_b32 m0, -1 +define amdgpu_ps void @v_interp_no_dead_setup(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg %m0, <2 x float> %arg3) { +entry: + %i = extractelement <2 x float> %arg3, i32 0 + %j = extractelement <2 x float> %arg3, i32 1 + call void asm sideeffect "", ""() + br i1 undef, label %if, label %endif + +if: + %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %m0) + %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %m0) + %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %m0) + %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %m0) + %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0) + %w = fadd float %p1_1, %const + store volatile float %p0_0, float addrspace(1)* undef + store volatile float %p0_1, float addrspace(1)* undef + store volatile float %w, float addrspace(1)* undef + br label %endif + +endif: + ret void +} + declare float @llvm.fabs.f32(float) #0 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll @@ -61,6 +61,36 @@ ret void } +; GCN-LABEL: {{^}}test_interrupt_undef_m0: +; GCN-NOT: s_mov_b32 m0 +; GCN: s_sendmsg sendmsg(MSG_INTERRUPT) +; GCN-NOT: s_mov_b32 m0 +define void @test_interrupt_undef_m0() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 1, i32 undef); + ret void +} + +; FIXME: Should not get any m0 def +; GCN-LABEL: {{^}}test_interrupt_undef_m0_nonentry: +; GCN: ; implicit-def: %M0 +; GCN-NOT: s_mov_b32 m0 +; GCN: s_cbranch_scc1 +; GCN: s_mov_b32 s0, m0 +; GCN: s_sendmsg sendmsg(MSG_INTERRUPT) +; GCN-NOT: s_mov_b32 m0 +define void @test_interrupt_undef_m0_nonentry() { +entry: + br i1 undef, label %if, label %endif + +if: + call void @llvm.amdgcn.s.sendmsg(i32 1, i32 undef) + br label %endif + +endif: + ret void +} + ; GCN-LABEL: {{^}}sendmsghalt: ; GCN: s_mov_b32 m0, s0 ; VI-NEXT: s_nop 0 Index: test/CodeGen/AMDGPU/regcoalesce-dbg.mir =================================================================== --- test/CodeGen/AMDGPU/regcoalesce-dbg.mir +++ test/CodeGen/AMDGPU/regcoalesce-dbg.mir @@ -9,7 +9,7 @@ --- | define void @test(i32 addrspace(1)* %out) { ret void } - + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4) !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6) !2 = distinct !DISubprogram(name: "test", scope: !4, file: !4, line: 1, type: !3, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !5) @@ -25,15 +25,15 @@ --- name: test tracksRegLiveness: true -registers: +registers: - { id: 0, class: sgpr_64 } - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sgpr_32 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64_xexec } - { id: 5, class: sreg_32_xm0_xexec } - - { id: 6, class: sreg_32 } - - { id: 7, class: sreg_32 } + - { id: 6, class: sreg_32_xm0 } + - { 
id: 7, class: sreg_32_xm0 } - { id: 8, class: sreg_32_xm0 } - { id: 9, class: sreg_64 } - { id: 10, class: sreg_32_xm0 } @@ -47,13 +47,13 @@ - { id: 18, class: vgpr_32 } - { id: 19, class: vreg_64 } - { id: 20, class: vreg_64 } -liveins: +liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | bb.0: liveins: %sgpr0_sgpr1, %vgpr0 - + %3 = COPY killed %vgpr0 %0 = COPY killed %sgpr0_sgpr1 %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) Index: test/CodeGen/AMDGPU/shl_add_ptr.ll =================================================================== --- test/CodeGen/AMDGPU/shl_add_ptr.ll +++ test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -69,8 +69,9 @@ ; pointer can be used with an offset into the second one. ; SI-LABEL: {{^}}load_shl_base_lds_2: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: s_mov_b32 m0, -1 +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} + ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 ; SI: s_endpgm define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { Index: test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir =================================================================== --- test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir +++ test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir @@ -109,8 +109,8 @@ - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64_xexec } - { id: 5, class: sreg_64_xexec } - - { id: 6, class: sreg_32 } - - { id: 7, class: sreg_32 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } - { id: 8, class: sreg_32_xm0 } - { id: 9, class: sreg_64 } - { id: 10, class: sreg_32_xm0 } @@ -125,8 +125,8 @@ - { id: 19, class: vgpr_32 } - { id: 20, class: vreg_64 } - { id: 21, class: sreg_32_xm0 } - - { id: 22, class: sreg_32 } - - { id: 23, class: sreg_32 } + - { id: 22, class: sreg_32_xm0 } + - { id: 23, class: sreg_32_xm0 } - { id: 24, class: vgpr_32 } - { id: 25, class: vreg_64 } - { id: 26, class: vgpr_32 } @@ -193,8 +193,8 @@ - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64_xexec } - { id: 5, class: sreg_64_xexec } - - { id: 6, class: sreg_32 } - - { id: 7, class: sreg_32 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } - { id: 8, class: sreg_32_xm0 } - { id: 9, class: sreg_64 } - { id: 10, class: sreg_32_xm0 } @@ -209,8 +209,8 @@ - { id: 19, class: vgpr_32 } - { id: 20, class: vreg_64 } - { id: 21, class: sreg_32_xm0 } - - { id: 22, class: sreg_32 } - - { id: 23, class: sreg_32 } + - { id: 22, class: sreg_32_xm0 } + - { id: 23, class: sreg_32_xm0 } - { id: 24, class: vgpr_32 } - { id: 25, class: vreg_64 } - { id: 26, class: vgpr_32 } @@ -277,8 +277,8 @@ - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64_xexec } - { id: 5, class: sreg_64_xexec } - - { id: 6, class: sreg_32 } - - { id: 7, class: sreg_32 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } - { id: 8, class: sreg_32_xm0 } - { id: 9, class: sreg_64 } - { id: 10, class: sreg_32_xm0 } @@ -293,8 +293,8 @@ - { id: 19, class: vgpr_32 } - { id: 20, class: vreg_64 } - { id: 21, class: sreg_32_xm0 } - - { id: 22, class: sreg_32 } - - { id: 23, class: sreg_32 } + - { id: 22, class: sreg_32_xm0 } + - { id: 23, class: sreg_32_xm0 } - { id: 24, class: vgpr_32 } - { id: 25, class: vreg_64 } - { id: 26, class: vgpr_32 } @@ -360,8 +360,8 @@ - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64_xexec } - { id: 5, class: sreg_64_xexec } - - { id: 6, class: sreg_32 } - - { id: 7, class: sreg_32 } + - { 
id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } - { id: 8, class: sreg_32_xm0 } - { id: 9, class: sreg_64 } - { id: 10, class: sreg_32_xm0 } @@ -376,8 +376,8 @@ - { id: 19, class: vgpr_32 } - { id: 20, class: vreg_64 } - { id: 21, class: sreg_32_xm0 } - - { id: 22, class: sreg_32 } - - { id: 23, class: sreg_32 } + - { id: 22, class: sreg_32_xm0 } + - { id: 23, class: sreg_32_xm0 } - { id: 24, class: vgpr_32 } - { id: 25, class: vreg_64 } - { id: 26, class: vgpr_32 } @@ -445,8 +445,8 @@ - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64_xexec } - { id: 5, class: sreg_64_xexec } - - { id: 6, class: sreg_32 } - - { id: 7, class: sreg_32 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } - { id: 8, class: sreg_32_xm0 } - { id: 9, class: sreg_64 } - { id: 10, class: sreg_32_xm0 } @@ -461,8 +461,8 @@ - { id: 19, class: vgpr_32 } - { id: 20, class: vreg_64 } - { id: 21, class: sreg_32_xm0 } - - { id: 22, class: sreg_32 } - - { id: 23, class: sreg_32 } + - { id: 22, class: sreg_32_xm0 } + - { id: 23, class: sreg_32_xm0 } - { id: 24, class: vgpr_32 } - { id: 25, class: vreg_64 } - { id: 26, class: vgpr_32 } @@ -526,12 +526,12 @@ registers: - { id: 0, class: sgpr_64 } - { id: 1, class: sreg_32_xm0 } - - { id: 2, class: sgpr_32 } + - { id: 2, class: sreg_32_xm0 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64_xexec } - { id: 5, class: sreg_64_xexec } - - { id: 6, class: sreg_32 } - - { id: 7, class: sreg_32 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } - { id: 8, class: sreg_32_xm0 } - { id: 9, class: sreg_64 } - { id: 10, class: sreg_32_xm0 } @@ -546,8 +546,8 @@ - { id: 19, class: vgpr_32 } - { id: 20, class: vreg_64 } - { id: 21, class: sreg_32_xm0 } - - { id: 22, class: sreg_32 } - - { id: 23, class: sreg_32 } + - { id: 22, class: sreg_32_xm0 } + - { id: 23, class: sreg_32_xm0 } - { id: 24, class: vgpr_32 } - { id: 25, class: vreg_64 } - { id: 26, class: vgpr_32 } Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -63,27 +63,29 @@ ; m0 is killed, so it isn't necessary during the entry block spill to preserve it ; GCN-LABEL: {{^}}spill_kill_m0_lds: +; GCN: s_mov_b32 m0, -1 +; GCN: s_mov_b32 s1, -1 ; GCN: s_mov_b32 m0, s6 ; GCN: v_interp_mov_f32 -; TOSMEM-NOT: s_m0 ; TOSMEM: s_mov_b32 m0, s7 ; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill -; TOSMEM-NOT: m0 +; TOSMEM-NEXT: m0, vcc_hi +; TOSMEM-NEXT: vcc_hi, m0 -; TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s7, 0x100 -; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; TOSMEM-NOT: m0 +; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100 +; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill +; TOSMEM-NEXT: s_mov_b32 m0, vcc_hi ; TOSMEM: s_mov_b64 exec, ; TOSMEM: s_cbranch_execz ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: +; TOSMEM-NEXT: s_mov_b32 vcc_hi, m0 ; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload - +; TOSMEM-NEXT: m0, vcc_hi ; GCN-NOT: v_readlane_b32 m0 ; GCN-NOT: s_buffer_store_dword m0 @@ -131,8 +133,10 @@ ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: +; TOSMEM-NEXT: s_mov_b32 vcc_hi, m0 ; TOSMEM-NEXT: s_mov_b32 m0, s3 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, 
s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload +; TOSMEM-NEXT: s_mov_b32 m0, vcc_hi ; GCN-NOT: v_readlane_b32 m0 ; GCN-NOT: s_buffer_store_dword m0 @@ -160,29 +164,27 @@ ; GCN-LABEL: {{^}}restore_m0_lds: ; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] ; TOSMEM: s_cmp_eq_u32 -; TOSMEM-NOT: m0 -; TOSMEM: s_mov_b32 m0, s3 +; TOSMEM-NEXT: s_mov_b32 vcc_hi, m0 +; TOSMEM-NEXT: s_mov_b32 m0, s3 ; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill -; TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 -; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill -; TOSMEM-NOT: m0 +; TOSMEM-NEXT: s_mov_b32 m0, vcc_hi +; TOSMEM-NEXT: s_mov_b32 vcc_hi, m0 +; TOSMEM-NEXT: s_add_u32 m0, s3, 0x200 +; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill +; TOSMEM-NEXT: s_mov_b32 m0, vcc_hi ; TOSMEM: s_cbranch_scc1 -; TOSMEM: s_mov_b32 m0, -1 - ; TOSMEM: s_mov_b32 vcc_hi, m0 -; TOSMEM: s_mov_b32 m0, s3 -; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM-NEXT: s_mov_b32 m0, s3 +; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload +; TOSMEM-NEXT: s_mov_b32 m0, vcc_hi ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM: ds_write_b64 - -; TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 +; TOSMEM-NEXT: s_mov_b32 vcc_hi, m0 +; TOSMEM-NEXT: s_add_u32 m0, s3, 0x200 ; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload -; TOSMEM-NOT: m0 +; TOSMEM-NEXT: s_mov_b32 m0, vcc_hi ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM-NOT: m0 ; TOSMEM: s_mov_b32 m0, s0 Index: test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir =================================================================== --- test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir +++ test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir @@ -122,15 +122,15 @@ tracksRegLiveness: true registers: - { id: 0, class: sreg_64 } - - { id: 1, class: sreg_32 } + - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sgpr_32 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64 } - - { id: 5, class: sreg_32 } + - { id: 5, class: sreg_32_xm0 } - { id: 6, class: sreg_64 } - - { id: 7, class: sreg_32 } - - { id: 8, class: sreg_32 } - - { id: 9, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } - { id: 10, class: sreg_128 } - { id: 11, class: vgpr_32 } - { id: 12, class: vgpr_32 } @@ -184,15 +184,15 @@ tracksRegLiveness: true registers: - { id: 0, class: sreg_64 } - - { id: 1, class: sreg_32 } + - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sgpr_32 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64 } - - { id: 5, class: sreg_32 } + - { id: 5, class: sreg_32_xm0 } - { id: 6, class: sreg_64 } - - { id: 7, class: sreg_32 } - - { id: 8, class: sreg_32 } - - { id: 9, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } - { id: 10, class: sreg_128 } - { id: 11, class: vgpr_32 } - { id: 12, class: vgpr_32 } @@ -250,15 +250,15 @@ tracksRegLiveness: true registers: - { id: 0, class: sreg_64 } - - { id: 1, class: sreg_32 } + - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sgpr_32 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64 } - - { id: 5, class: sreg_32 } + - { id: 5, class: sreg_32_xm0 } - { id: 6, class: sreg_64 } - - { id: 7, class: sreg_32 } - - { id: 8, class: sreg_32 } - - { id: 9, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: 
sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } - { id: 10, class: sreg_128 } - { id: 11, class: vgpr_32 } - { id: 12, class: vgpr_32 } @@ -320,15 +320,15 @@ tracksRegLiveness: true registers: - { id: 0, class: sreg_64 } - - { id: 1, class: sreg_32 } + - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sgpr_32 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64 } - - { id: 5, class: sreg_32 } + - { id: 5, class: sreg_32_xm0 } - { id: 6, class: sreg_64 } - - { id: 7, class: sreg_32 } - - { id: 8, class: sreg_32 } - - { id: 9, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } - { id: 10, class: sreg_128 } - { id: 11, class: vgpr_32 } - { id: 12, class: vgpr_32 } @@ -389,15 +389,15 @@ tracksRegLiveness: true registers: - { id: 0, class: sreg_64 } - - { id: 1, class: sreg_32 } + - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sgpr_32 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64 } - - { id: 5, class: sreg_32 } + - { id: 5, class: sreg_32_xm0 } - { id: 6, class: sreg_64 } - - { id: 7, class: sreg_32 } - - { id: 8, class: sreg_32 } - - { id: 9, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } - { id: 10, class: sreg_128 } - { id: 11, class: vgpr_32 } - { id: 12, class: vgpr_32 } @@ -454,15 +454,15 @@ tracksRegLiveness: true registers: - { id: 0, class: sreg_64 } - - { id: 1, class: sreg_32 } + - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sgpr_32 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64 } - - { id: 5, class: sreg_32 } + - { id: 5, class: sreg_32_xm0 } - { id: 6, class: sreg_64 } - - { id: 7, class: sreg_32 } - - { id: 8, class: sreg_32 } - - { id: 9, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } - { id: 10, class: sreg_128 } - { id: 11, class: vgpr_32 } - { id: 12, class: vgpr_32 } @@ -526,15 +526,15 @@ tracksRegLiveness: true registers: - { id: 0, class: sreg_64 } - - { id: 1, class: sreg_32 } + - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sgpr_32 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64 } - - { id: 5, class: sreg_32 } + - { id: 5, class: sreg_32_xm0 } - { id: 6, class: sreg_64 } - - { id: 7, class: sreg_32 } - - { id: 8, class: sreg_32 } - - { id: 9, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } - { id: 10, class: sreg_128 } - { id: 11, class: vgpr_32 } - { id: 12, class: vgpr_32 } @@ -593,15 +593,15 @@ tracksRegLiveness: true registers: - { id: 0, class: sreg_64 } - - { id: 1, class: sreg_32 } + - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sgpr_32 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64 } - - { id: 5, class: sreg_32 } + - { id: 5, class: sreg_32_xm0 } - { id: 6, class: sreg_64 } - - { id: 7, class: sreg_32 } - - { id: 8, class: sreg_32 } - - { id: 9, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } - { id: 10, class: sreg_128 } - { id: 11, class: vgpr_32 } - { id: 12, class: vgpr_32 } @@ -659,15 +659,15 @@ tracksRegLiveness: true registers: - { id: 0, class: sreg_64 } - - { id: 1, class: sreg_32 } + - { id: 1, class: sreg_32_xm0 } - { id: 2, class: sgpr_32 } - { id: 3, class: vgpr_32 } - { id: 4, class: sreg_64 } - - { id: 5, class: sreg_32 } + - { id: 5, class: sreg_32_xm0 } - { id: 6, class: sreg_64 } - - { id: 7, class: sreg_32 } - - { id: 8, class: sreg_32 } - - { 
id: 9, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32_xm0 } + - { id: 9, class: sreg_32_xm0 } - { id: 10, class: sreg_128 } - { id: 11, class: vgpr_32 } - { id: 12, class: vgpr_32 }