Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -218,9 +218,10 @@
   FIRST_NUMBER = ISD::BUILTIN_OP_END,
   CALL,        // Function call based on a single integer
   UMUL,        // 32bit unsigned multiplication
-  RET_FLAG,
   BRANCH_COND,
   // End AMDIL ISD Opcodes
+  ENDPGM,
+  RETURN,
   DWORDADDR,
   FRACT,
   CLAMP,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -652,7 +652,7 @@
                                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                                      const SmallVectorImpl<SDValue> &OutVals,
                                      const SDLoc &DL, SelectionDAG &DAG) const {
-  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
+  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
 }
 
 //===---------------------------------------------------------------------===//
@@ -2722,10 +2722,11 @@
   // AMDIL DAG nodes
   NODE_NAME_CASE(CALL);
   NODE_NAME_CASE(UMUL);
-  NODE_NAME_CASE(RET_FLAG);
   NODE_NAME_CASE(BRANCH_COND);
 
   // AMDGPU DAG nodes
+  NODE_NAME_CASE(ENDPGM)
+  NODE_NAME_CASE(RETURN)
   NODE_NAME_CASE(DWORDADDR)
   NODE_NAME_CASE(FRACT)
   NODE_NAME_CASE(CLAMP)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -261,5 +261,8 @@
 //===----------------------------------------------------------------------===//
 // Call/Return DAG Nodes
 //===----------------------------------------------------------------------===//
-def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
+    [SDNPHasChain, SDNPOptInGlue]>;
+
+def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
Index: lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -17,7 +17,6 @@
 #include "AMDGPUAsmPrinter.h"
 #include "AMDGPUTargetMachine.h"
 #include "InstPrinter/AMDGPUInstPrinter.h"
-#include "R600InstrInfo.h"
 #include "SIInstrInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -107,6 +106,29 @@
       ++I;
     }
   } else {
+    // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
+    // terminator instructions and should only be printed as comments.
+    if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+      if (isVerbose()) {
+        SmallVector BBStr;
+        raw_svector_ostream Str(BBStr);
+
+        const MachineBasicBlock *MBB = MI->getOperand(1).getMBB();
+        const MCSymbolRefExpr *Expr
+          = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+        Expr->print(Str, MAI);
+        OutStreamer->emitRawComment(" mask branch " + BBStr);
+      }
+
+      return;
+    }
+
+    if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+      if (isVerbose())
+        OutStreamer->emitRawComment(" return");
+      return;
+    }
+
     MCInst TmpInst;
     MCInstLowering.lower(MI, TmpInst);
     EmitToStreamer(*OutStreamer, TmpInst);
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -448,8 +448,8 @@
 
   addPass(createSIInsertWaitsPass());
   addPass(createSIShrinkInstructionsPass());
-  addPass(createSILowerControlFlowPass(), false);
-  addPass(createSIDebuggerInsertNopsPass(), false);
+  addPass(createSILowerControlFlowPass());
+  addPass(createSIDebuggerInsertNopsPass());
 }
 
 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
Index: lib/Target/AMDGPU/R600Instructions.td
===================================================================
--- lib/Target/AMDGPU/R600Instructions.td
+++ lib/Target/AMDGPU/R600Instructions.td
@@ -1539,8 +1539,9 @@
 //===---------------------------------------------------------------------===//
 let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
     usesCustomInserter = 1 in {
-  def RETURN : ILFormat<(outs), (ins variable_ops),
-      "RETURN", [(IL_retflag)]>;
+  def RETURN : ILFormat<(outs), (ins variable_ops),
+      "RETURN", [(AMDGPUendpgm)]
+  >;
 }
 
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1002,7 +1002,8 @@
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
 
 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
@@ -1463,8 +1464,8 @@
 
   // FIXME: This should really be selected to s_trap, but that requires
   // setting up the trap handler for it to do anything.
-  return DAG.getNode(AMDGPUISD::RET_FLAG, SDLoc(Op), MVT::Other, Op.
-                     getOperand(0));
+  return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
+                     Op.getOperand(0));
 }
 
 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -11,8 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
-    AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+class InstSI <dag outs, dag ins, string asm = "", list<dag> pattern = []> :
+  AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
 
   field bits<1> VM_CNT = 0;
   field bits<1> EXP_CNT = 0;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -426,7 +426,7 @@
 let isTerminator = 1 in {
 
 def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
-  [(IL_retflag)]> {
+  [(AMDGPUendpgm)]> {
   let simm16 = 0;
   let isBarrier = 1;
   let hasCtrlDep = 1;
@@ -1908,7 +1908,7 @@
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0
 
 let hasSideEffects = 1, SALU = 1 in {
-def SGPR_USE : InstSI <(outs),(ins), "", []>;
+def SGPR_USE : InstSI <(outs), (ins)>;
 }
 
 let usesCustomInserter = 1, SALU = 1 in {
@@ -1919,61 +1919,57 @@
 // SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
+let hasSideEffects = 1, isPseudo = 1, isCodeGenOnly = 1 in {
+
+// Dummy terminator instruction to use after control flow instructions
+// replaced with exec mask operations.
+def SI_MASK_BRANCH : InstSI <
+  (outs SReg_64:$dst), (ins brtarget:$target)> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let SALU = 1;
+}
+
 let Uses = [EXEC], Defs = [EXEC] in {
 
 let isBranch = 1, isTerminator = 1 in {
 
 def SI_IF: InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$vcc, brtarget:$target),
-  "",
+  (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), "",
   [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]
 >;
 
 def SI_ELSE : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$src, brtarget:$target),
-  "",
-  [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]
-> {
+  (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), "",
+  [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> {
   let Constraints = "$src = $dst";
 }
 
 def SI_LOOP : InstSI <
-  (outs),
-  (ins SReg_64:$saved, brtarget:$target),
-  "si_loop $saved, $target",
+  (outs), (ins SReg_64:$saved, brtarget:$target), "",
   [(int_amdgcn_loop i64:$saved, bb:$target)]
 >;
 
 } // End isBranch = 1, isTerminator = 1
 
 def SI_BREAK : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$src),
-  "si_else $dst, $src",
+  (outs SReg_64:$dst), (ins SReg_64:$src), "",
   [(set i64:$dst, (int_amdgcn_break i64:$src))]
 >;
 
 def SI_IF_BREAK : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$vcc, SReg_64:$src),
-  "si_if_break $dst, $vcc, $src",
+  (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), "",
   [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]
 >;
 
 def SI_ELSE_BREAK : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$src0, SReg_64:$src1),
-  "si_else_break $dst, $src0, $src1",
+  (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), "",
   [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]
 >;
 
 def SI_END_CF : InstSI <
-  (outs),
-  (ins SReg_64:$saved),
-  "si_end_cf $saved",
+  (outs), (ins SReg_64:$saved), "",
   [(int_amdgcn_end_cf i64:$saved)]
 >;
 
@@ -1981,30 +1977,24 @@
 let Uses = [EXEC], Defs = [EXEC,VCC] in {
 
 def SI_KILL : InstSI <
-  (outs),
-  (ins VSrc_32:$src),
-  "si_kill $src",
+  (outs), (ins VSrc_32:$src), "",
   [(int_AMDGPU_kill f32:$src)]
 >;
 
 } // End Uses = [EXEC], Defs = [EXEC,VCC]
 
 } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
 
-let SALU = 1 in
 def SI_PS_LIVE : InstSI <
-  (outs SReg_64:$dst),
-  (ins),
-  "si_ps_live $dst",
-  [(set i1:$dst, (int_amdgcn_ps_live))]
->;
+  (outs SReg_64:$dst), (ins), "",
+  [(set i1:$dst, (int_amdgcn_ps_live))]> {
+  let SALU = 1;
+}
 
 // Used as an isel pseudo to directly emit initialization with an
 // s_mov_b32 rather than a copy of another initialized
 // register. MachineCSE skips copies, and we don't want to have to
 // fold operands before it runs.
-def SI_INIT_M0 : InstSI <
-  (outs),
-  (ins SSrc_32:$src), "", []> {
+def SI_INIT_M0 : InstSI <(outs), (ins SSrc_32:$src)> {
   let Defs = [M0];
   let usesCustomInserter = 1;
   let isPseudo = 1;
@@ -2014,21 +2004,28 @@
   let isReMaterializable = 1;
 }
 
+def SI_RETURN : InstSI <
+  (outs), (ins variable_ops), "", [(AMDGPUreturn)]> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasSideEffects = 1;
+  let SALU = 1;
+  let hasNoSchedulingInfo = 1;
+}
+
 let Uses = [EXEC], Defs = [EXEC, VCC, M0] in {
 
 class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
   (outs VGPR_32:$dst, SReg_64:$temp),
-  (ins rc:$src, VSrc_32:$idx, i32imm:$off),
-  "si_indirect_src $dst, $temp, $src, $idx, $off",
-  []
+  (ins rc:$src, VSrc_32:$idx, i32imm:$off)
 >;
 
 class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
   (outs rc:$dst, SReg_64:$temp),
-  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val),
-  "si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
-  []
-> {
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val)> {
   let Constraints = "$src = $dst";
 }
 
@@ -2052,16 +2049,14 @@
   let UseNamedOperandTable = 1, Uses = [EXEC] in {
     def _SAVE : InstSI <
       (outs),
-      (ins sgpr_class:$src, i32imm:$frame_idx),
-      "", []> {
+      (ins sgpr_class:$src, i32imm:$frame_idx)> {
       let mayStore = 1;
       let mayLoad = 0;
     }
 
     def _RESTORE : InstSI <
       (outs sgpr_class:$dst),
-      (ins i32imm:$frame_idx),
-      "", []> {
+      (ins i32imm:$frame_idx)> {
      let mayStore = 0;
      let mayLoad = 1;
    }
@@ -2082,8 +2077,7 @@
    def _SAVE : InstSI <
      (outs),
      (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
-           SReg_32:$scratch_offset, i32imm:$offset),
-      "", []> {
+           SReg_32:$scratch_offset, i32imm:$offset)> {
      let mayStore = 1;
      let mayLoad = 0;
    }
@@ -2091,8 +2085,7 @@
    def _RESTORE : InstSI <
      (outs vgpr_class:$dst),
      (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset,
-           i32imm:$offset),
-      "", []> {
+           i32imm:$offset)> {
      let mayStore = 0;
      let mayLoad = 1;
    }
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -88,10 +88,14 @@
   void Kill(MachineInstr &MI);
   void Branch(MachineInstr &MI);
 
-  void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
+  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
+                              MachineInstr *MovRel,
+                              unsigned SaveReg, unsigned IdxReg, int Offset);
+
+  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
   void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
-  void IndirectSrc(MachineInstr &MI);
-  void IndirectDst(MachineInstr &MI);
+  bool indirectSrc(MachineInstr &MI);
+  bool indirectDst(MachineInstr &MI);
 
 public:
   static char ID;
@@ -104,11 +108,6 @@
   const char *getPassName() const override {
     return "SI Lower control flow pseudo instructions";
   }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
 };
 
 } // End anonymous namespace
@@ -227,6 +226,10 @@
 
   Skip(MI, MI.getOperand(2));
 
+  // Insert a pseudo terminator to help keep the verifier happy.
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Reg)
+    .addOperand(MI.getOperand(2));
+
   MI.eraseFromParent();
 }
 
@@ -255,6 +258,10 @@
 
   Skip(MI, MI.getOperand(2));
 
+  // Insert a pseudo terminator to help keep the verifier happy.
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Dst)
+    .addOperand(MI.getOperand(2));
+
   MI.eraseFromParent();
 }
 
@@ -331,7 +338,8 @@
 }
 
 void SILowerControlFlow::Branch(MachineInstr &MI) {
-  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
+  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+  if (MBB == MI.getParent()->getNextNode())
     MI.eraseFromParent();
 
   // If these aren't equal, this is probably an infinite loop.
@@ -365,75 +373,109 @@
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
+                                                DebugLoc DL,
+                                                MachineInstr *MovRel,
+                                                unsigned SaveReg,
+                                                unsigned IdxReg,
+                                                int Offset) {
+  MachineBasicBlock::iterator I = LoopBB.begin();
+
+  // Read the next variant into VCC (lower 32 bits) <- also loop target
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
+    .addReg(IdxReg);
+
+  // Move index from VCC into M0
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+    .addReg(AMDGPU::VCC_LO);
+
+  // Compare the just read M0 value to all possible Idx values
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
+    .addReg(AMDGPU::M0)
+    .addReg(IdxReg);
+
+  // Update EXEC, save the original EXEC value to VCC
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+    .addReg(AMDGPU::VCC);
+
+  if (Offset) {
+    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+      .addReg(AMDGPU::M0)
+      .addImm(Offset);
+  }
+
+  // Do the actual move
+  LoopBB.insert(I, MovRel);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(AMDGPU::VCC);
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addMBB(&LoopBB);
+}
+
+// Returns true if a new block was inserted.
+bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   MachineBasicBlock::iterator I = MI;
 
-  unsigned Save = MI.getOperand(1).getReg();
   unsigned Idx = MI.getOperand(3).getReg();
 
   if (AMDGPU::SReg_32RegClass.contains(Idx)) {
     if (Offset) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-              .addReg(Idx)
-              .addImm(Offset);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+        .addReg(Idx)
+        .addImm(Offset);
     } else {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-              .addReg(Idx);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(Idx);
     }
-    MBB.insert(I, MovRel);
-  } else {
-    assert(AMDGPU::SReg_64RegClass.contains(Save));
-    assert(AMDGPU::VGPR_32RegClass.contains(Idx));
 
+    MBB.insert(I, MovRel);
+    MI.eraseFromParent();
+    return false;
+  }
 
-    // Save the EXEC mask
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
-            .addReg(AMDGPU::EXEC);
+  MachineFunction &MF = *MBB.getParent();
+  unsigned Save = MI.getOperand(1).getReg();
 
-    // Read the next variant into VCC (lower 32 bits) <- also loop target
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
-            AMDGPU::VCC_LO)
-            .addReg(Idx);
+  // Reading from a VGPR requires looping over all workitems in the wavefront.
+  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
+         AMDGPU::VGPR_32RegClass.contains(Idx));
 
-    // Move index from VCC into M0
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-            .addReg(AMDGPU::VCC_LO);
+  // Save the EXEC mask
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+    .addReg(AMDGPU::EXEC);
 
-    // Compare the just read M0 value to all possible Idx values
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
-            .addReg(AMDGPU::M0)
-            .addReg(Idx);
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
 
-    // Update EXEC, save the original EXEC value to VCC
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
-            .addReg(AMDGPU::VCC);
+  MF.insert(MBBI, LoopBB);
+  MF.insert(MBBI, RemainderBB);
 
-    if (Offset) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-              .addReg(AMDGPU::M0)
-              .addImm(Offset);
-    }
-    // Do the actual move
-    MBB.insert(I, MovRel);
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(RemainderBB);
 
-    // Update EXEC, switch all done bits to 0 and all todo bits to 1
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-            .addReg(AMDGPU::EXEC)
-            .addReg(AMDGPU::VCC);
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessors(&MBB);
+  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
 
-    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-            .addImm(-7);
+  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, Save, Idx, Offset);
 
-    // Restore EXEC
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-            .addReg(Save);
+  MachineBasicBlock::iterator First = RemainderBB->begin();
+  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+    .addReg(Save);
 
-  }
   MI.eraseFromParent();
+  return true;
 }
 
 /// \param @VecReg The register which holds element zero of the vector
@@ -463,8 +505,8 @@
   Reg = RC->getRegister(RegIdx);
 }
 
-void SILowerControlFlow::IndirectSrc(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
 
@@ -480,11 +522,11 @@
     .addReg(Reg)
     .addReg(Vec, RegState::Implicit);
 
-  LoadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Off);
 }
 
-void SILowerControlFlow::IndirectDst(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
 
@@ -501,7 +543,7 @@
     .addReg(Val)
     .addReg(Dst, RegState::Implicit);
 
-  LoadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Off);
 }
 
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
@@ -514,11 +556,14 @@
   bool NeedFlat = false;
   unsigned Depth = 0;
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; ++BI) {
+  MachineFunction::iterator NextBB;
 
-    MachineBasicBlock *EmptyMBBAtEnd = NULL;
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; BI = NextBB) {
+    NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
+
+    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
     MachineBasicBlock::iterator I, Next;
     bool ExecModified = false;
 
@@ -591,7 +636,15 @@
       case AMDGPU::SI_INDIRECT_SRC_V4:
      case AMDGPU::SI_INDIRECT_SRC_V8:
      case AMDGPU::SI_INDIRECT_SRC_V16:
-        IndirectSrc(MI);
+        if (indirectSrc(MI)) {
+          // The block was split at this point. We can safely skip the middle
+          // inserted block to the following which contains the rest of this
+          // block's instructions.
+          NextBB = std::next(BI);
+          BE = MF.end();
+          Next = MBB.end();
+        }
+
        break;
 
      case AMDGPU::SI_INDIRECT_DST_V1:
@@ -599,7 +652,15 @@
      case AMDGPU::SI_INDIRECT_DST_V2:
      case AMDGPU::SI_INDIRECT_DST_V4:
      case AMDGPU::SI_INDIRECT_DST_V8:
      case AMDGPU::SI_INDIRECT_DST_V16:
-        IndirectDst(MI);
+        if (indirectDst(MI)) {
+          // The block was split at this point. We can safely skip the middle
+          // inserted block to the following which contains the rest of this
+          // block's instructions.
+          NextBB = std::next(BI);
+          BE = MF.end();
+          Next = MBB.end();
+        }
+
        break;
 
      case AMDGPU::S_ENDPGM: {
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -174,6 +174,213 @@
   ret void
 }
 
+; When the block is split to insert the loop, make sure any other
+; places that need to be expanded in the same block are also handled.
+
+; CHECK-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
+
+; CHECK: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
+; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
+; CHECK: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: s_mov_b64 [[MASK]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dword [[MOVREL0]]
+; CHECK: buffer_store_dword [[MOVREL1]]
+define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %id.ext = zext i32 %id to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %idx1 = add i32 %idx0, 1
+  %val0 = extractelement <4 x i32> , i32 %idx0
+  %val1 = extractelement <4 x i32> , i32 %idx1
+  store volatile i32 %val0, i32 addrspace(1)* %out0
+  store volatile i32 %val1, i32 addrspace(1)* %out0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
+; CHECK-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
+; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], s[[S_ELT0]]
+; CHECK-DAG: v_mov_b32_e32 [[INS0:v[0-9]+]], 62
+; CHECK-DAG: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL0:[0-9]+]], [[INS0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: v_mov_b32_e32 [[INS1:v[0-9]+]], 63
+; CHECK: s_mov_b64 [[MASK]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL1:[0-9]+]], [[INS1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dwordx4 v{{\[}}[[MOVREL0]]:
+define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %id.ext = zext i32 %id to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %idx1 = add i32 %idx0, 1
+  %vec1 = insertelement <4 x i32> %vec0, i32 62, i32 %idx0
+  %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
+  store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @extract_adjacent_blocks(i32 %arg) #0 {
+bb:
+  %tmp = icmp eq i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb4
+
+bb1:
+  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp3 = extractelement <4 x float> %tmp2, i32 undef
+  br label %bb7
+
+bb4:
+  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp6 = extractelement <4 x float> %tmp5, i32 undef
+  br label %bb7
+
+bb7:
+  %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+  store volatile float %tmp8, float addrspace(1)* undef
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
+bb:
+  %tmp = icmp eq i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb4
+
+bb1: ; preds = %bb
+  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
+  br label %bb7
+
+bb4: ; preds = %bb
+  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
+  br label %bb7
+
+bb7: ; preds = %bb4, %bb1
+  %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+  store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
+  ret void
+}
+
+; FIXME: Should be able to fold zero input to movreld to inline imm?
+
+; CHECK-LABEL: {{^}}multi_same_block:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; CHECK-DAG: s_add_i32 m0, [[ARG]], -16
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, [[ZERO]]
+
+; CHECK: s_add_i32 m0, [[ARG]], -14
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
+; CHECK: s_mov_b32 m0, -1
+; CHECK: ds_write_b32
+; CHECK: ds_write_b32
+; CHECK: s_endpgm
+define void @multi_same_block(i32 %arg) #0 {
+bb:
+  %tmp1 = add i32 %arg, -16
+  %tmp2 = insertelement <6 x float> , float 0.000000e+00, i32 %tmp1
+  %tmp3 = add i32 %arg, -16
+  %tmp4 = insertelement <6 x float> , float 0x3FB99999A0000000, i32 %tmp3
+  %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
+  %tmp6 = extractelement <6 x i32> %tmp5, i32 1
+  %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
+  %tmp8 = extractelement <6 x i32> %tmp7, i32 5
+  store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
+  store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
+attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/ret_jump.ll
===================================================================
--- test/CodeGen/AMDGPU/ret_jump.ll
+++ test/CodeGen/AMDGPU/ret_jump.ll
@@ -1,17 +1,22 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-target triple = "amdgcn--"
+; This should end with a no-op sequence of exec mask manipulations.
+; The mask should be in its original state after the unreachable block executes.
 
 ; GCN-LABEL: {{^}}main:
-; GCN: BB0_3:
-; GCN-NEXT: s_branch [[LASTBB:BB[0-9]*_[0-9]*]]
-; GCN-NEXT: BB0_
-; GCN: [[LASTBB]]
-; GCN-NEXT: .Lfunc_end0:
-; ModuleID = 'bugpoint-reduced-simplified.bc'
-target triple = "amdgcn--"
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: [[RET_BB]]:
+; GCN-NEXT: ; return
+
+; GCN-NEXT: [[UNREACHABLE_BB]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[XOR_EXEC]]
+; GCN-NEXT: .Lfunc_end0
 define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
 main_body:
   %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -122,9 +122,13 @@
 ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
 ;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
-;CHECK-NEXT: %ELSE
-;CHECK: store
-;CHECK: %END
+;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
+;CHECK-NEXT: ; BB#3: ; %ELSE
+;CHECK: store_dword
+;CHECK: [[END_BB]]: ; %END
+;CHECK: s_or_b64 exec, exec,
+;CHECK: v_mov_b32_e32 v0
+;CHECK: ; return
 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
 main_body:
   %cmp = icmp eq i32 %z, 0