Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -74,6 +74,7 @@ void initializeAMDGPUPromoteAllocaPass(PassRegistry&); extern char &AMDGPUPromoteAllocaID; +FunctionPass *createAMDGPUAddDivergenceMetadata(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); ModulePass *createAMDGPUAlwaysInlinePass(); Index: lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -43,6 +43,7 @@ AU.setPreservesAll(); } + void visitBranchInst(BranchInst &I); void visitLoadInst(LoadInst &I); }; @@ -57,13 +58,28 @@ char AMDGPUAnnotateUniformValues::ID = 0; +static void setUniformMetadata(Instruction *I) { + I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {})); +} + +void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) { + if (I.isUnconditional()) + return; + + Value *Cond = I.getCondition(); + if (!DA->isUniform(Cond)) + return; + + setUniformMetadata(I.getParent()->getTerminator()); +} + void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { Value *Ptr = I.getPointerOperand(); if (!DA->isUniform(Ptr)) return; if (Instruction *PtrI = dyn_cast(Ptr)) - PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {})); + setUniformMetadata(PtrI); } Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" @@ -36,6 +37,20 @@ //===----------------------------------------------------------------------===// namespace { + +static bool isCBranchSCC(const SDNode *N) { + assert(N->getOpcode() == ISD::BRCOND); + if (!N->hasOneUse()) + return false; + + SDValue Cond = N->getOperand(1); + if (Cond.getOpcode() == ISD::CopyToReg) + Cond = Cond.getOperand(2); + return Cond.getOpcode() == ISD::SETCC && + Cond.getOperand(0).getValueType() == MVT::i32 && + Cond.hasOneUse(); +} + /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations. class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -82,6 +97,8 @@ bool isLocalLoad(const LoadSDNode *N) const; bool isRegionLoad(const LoadSDNode *N) const; + bool isUniformBr(const SDNode *N) const; + SDNode *glueCopyToM0(SDNode *N) const; const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; @@ -143,6 +160,7 @@ uint32_t Offset, uint32_t Width); SDNode *SelectS_BFEFromShifts(SDNode *N); SDNode *SelectS_BFE(SDNode *N); + SDNode *SelectBRCOND(SDNode *N); // Include the pieces autogenerated from the target description. 
#include "AMDGPUGenDAGISel.inc" @@ -509,6 +527,8 @@ break; return SelectS_BFE(N); + case ISD::BRCOND: + return SelectBRCOND(N); } return SelectCode(N); @@ -623,6 +643,11 @@ return false; } +bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { + const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); + return BB->getTerminator()->getMetadata("amdgpu.uniform"); +} + const char *AMDGPUDAGToDAGISel::getPassName() const { return "AMDGPU DAG->DAG Pattern Instruction Selection"; } @@ -1365,6 +1390,36 @@ return SelectCode(N); } +SDNode *AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { + SDValue Cond = N->getOperand(1); + + if (isCBranchSCC(N)) { + // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it. + return SelectCode(N); + } + + // The result of VOPC instructions is or'd against ~EXEC before it is + // written to vcc or another SGPR. This means that the value '1' is always + // written to the corresponding bit for results that are masked. In order + // to correctly check against vccz, we need to and VCC with the EXEC + // register in order to clear the value from the masked bits. + + SDLoc SL(N); + + SDNode *MaskedCond = + CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1, + CurDAG->getRegister(AMDGPU::EXEC, MVT::i1), + Cond); + SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, + SDValue(MaskedCond, 0), + SDValue()); // Passing SDValue() adds a + // glue output. + return CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other, + N->getOperand(2), // Basic Block + VCC.getValue(0), // Chain + VCC.getValue(1)); // Glue +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const { Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -239,10 +239,7 @@ bool AMDGPUPassConfig::addPreISel() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); addPass(createFlattenCFGPass()); - if (ST.IsIRStructurizerEnabled()) - addPass(createStructurizeCFGPass()); return false; } @@ -262,6 +259,9 @@ bool R600PassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + if (ST.IsIRStructurizerEnabled()) + addPass(createStructurizeCFGPass()); addPass(createR600TextureIntrinsicsReplacer()); return false; } @@ -300,11 +300,11 @@ // FIXME: We need to run a pass to propagate the attributes when calls are // supported. 
addPass(&AMDGPUAnnotateKernelFeaturesID); - + addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions addPass(createSinkingPass()); addPass(createSITypeRewriter()); - addPass(createSIAnnotateControlFlowPass()); addPass(createAMDGPUAnnotateUniformValues()); + addPass(createSIAnnotateControlFlowPass()); return false; } Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" Index: lib/Target/AMDGPU/SIAnnotateControlFlow.cpp =================================================================== --- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -14,6 +14,7 @@ #include "AMDGPU.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" @@ -43,6 +44,7 @@ static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf"; class SIAnnotateControlFlow : public FunctionPass { + DivergenceAnalysis *DA; Type *Boolean; Type *Void; @@ -104,6 +106,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<DivergenceAnalysis>(); AU.addPreserved<DominatorTreeWrapperPass>(); FunctionPass::getAnalysisUsage(AU); } @@ -114,6 +117,7 @@ INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) @@ -199,6 +203,9 @@ /// \brief Open a new "If" block void SIAnnotateControlFlow::openIf(BranchInst *Term) { + if (DA->isUniform(Term->getCondition())) { + return; + } Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); @@ -206,6 +213,9 @@ /// \brief Close the last "If" block and open a new "Else" block void SIAnnotateControlFlow::insertElse(BranchInst *Term) { + if (DA->isUniform(Term->getCondition())) { + return; + } Value *Ret = CallInst::Create(Else, popSaved(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); @@ -284,6 +294,10 @@ /// \brief Handle a back edge (loop) void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { + if (DA->isUniform(Term->getCondition())) { + return; + } + BasicBlock *BB = Term->getParent(); llvm::Loop *L = LI->getLoopFor(BB); BasicBlock *Target = Term->getSuccessor(1); @@ -305,6 +319,9 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { llvm::Loop *L = LI->getLoopFor(BB); + if (Stack.back().first != BB) + return; + if (L && L->getHeader() == BB) { // We can't insert an EndCF call into a loop header, because it will // get executed on every iteration of the loop, when it should be @@ -320,14 +337,18 @@ BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); } - CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt()); + Value *Exec = popSaved(); + if (!isa<UndefValue>(Exec)) + CallInst::Create(EndCf, Exec, 
"", &*BB->getFirstInsertionPt()); } /// \brief Annotate the control flow with intrinsics so the backend can /// recognize if/then/else and loops. bool SIAnnotateControlFlow::runOnFunction(Function &F) { + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + DA = &getAnalysis<DivergenceAnalysis>(); for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { @@ -337,12 +358,14 @@ if (!Term || Term->isUnconditional()) { if (isTopOfStack(*I)) closeControlFlow(*I); + continue; } if (I.nodeVisited(Term->getSuccessor(1))) { if (isTopOfStack(*I)) closeControlFlow(*I); + handleLoop(Term); continue; } Index: lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp =================================================================== --- lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp +++ lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp @@ -108,9 +108,20 @@ return new SIFixSGPRLiveRanges(); } +static bool hasOnlyScalarBr(const MachineBasicBlock *MBB, + const SIInstrInfo *TII) { + for (MachineBasicBlock::const_iterator I = MBB->getFirstTerminator(), + E = MBB->end(); I != E; ++I) { + if (!TII->isSOPP(*I)) + return false; + } + return true; +} + bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( MF.getSubtarget().getRegisterInfo()); bool MadeChange = false; @@ -147,7 +158,7 @@ } } - if (MBB->succ_size() < 2) + if (MBB->succ_size() < 2 || hasOnlyScalarBr(MBB, TII)) continue; // We have structured control flow, so the number of successors should be Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -60,6 +60,8 @@ bool isLegalFlatAddressingMode(const AddrMode &AM) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; + + bool isCFIntrinsic(const SDNode *Intr) const; public: SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -130,6 +130,10 @@ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::i64, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BR_CC, MVT::f64, Expand); for (MVT VT : MVT::integer_valuetypes()) { if (VT == MVT::i64) @@ -1183,6 +1187,23 @@ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); } +bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { + if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return false; + + switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { + default: return false; + case AMDGPUIntrinsic::amdgcn_if: + case AMDGPUIntrinsic::amdgcn_else: + case AMDGPUIntrinsic::amdgcn_break: + case AMDGPUIntrinsic::amdgcn_if_break: + case AMDGPUIntrinsic::amdgcn_else_break: + case AMDGPUIntrinsic::amdgcn_loop: + case AMDGPUIntrinsic::amdgcn_end_cf: + return true; + } +} + /// This transforms the control flow intrinsics to get the branch destination as /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue 
BRCOND, @@ -1193,13 +1214,11 @@ SDNode *Intr = BRCOND.getOperand(1).getNode(); SDValue Target = BRCOND.getOperand(2); SDNode *BR = nullptr; + SDNode *SetCC = nullptr; if (Intr->getOpcode() == ISD::SETCC) { // As long as we negate the condition everything is fine - SDNode *SetCC = Intr; - assert(SetCC->getConstantOperandVal(1) == 1); - assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == - ISD::SETNE); + SetCC = Intr; Intr = SetCC->getOperand(0).getNode(); } else { @@ -1208,7 +1227,16 @@ Target = BR->getOperand(1); } - assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); + if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) { + // This is a uniform branch so we don't need to legalize. + return BRCOND; + } + + assert(!SetCC || + (SetCC->getConstantOperandVal(1) == 1 && + isCFIntrinsic(Intr) && + cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == + ISD::SETNE)); // Build the result and ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -59,6 +59,9 @@ unsigned Reg, MachineRegisterInfo &MRI, SmallVectorImpl<MachineInstr *> &Worklist) const; + void addSCCDefUsersToVALUWorklist( + MachineInstr *SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const; + const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1437,6 +1437,16 @@ int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + // Make sure we don't have SCC live-ins to basic blocks. moveToVALU assumes + // all SCC users are in the same blocks as their defs. + const MachineBasicBlock *MBB = MI->getParent(); + if (MI == &MBB->front()) { + if (MBB->isLiveIn(AMDGPU::SCC)) { + ErrInfo = "scc register cannot be live across blocks."; + return false; + } + } + // Make sure the number of operands is correct. 
const MCInstrDesc &Desc = get(Opcode); if (!Desc.isVariadic() && @@ -1605,6 +1615,12 @@ case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; + case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; + case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; + case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; + case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; + case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; + case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORD_SGPR: case AMDGPU::S_LOAD_DWORD_IMM_ci: @@ -1621,6 +1637,8 @@ case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; + case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; + case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; } } @@ -1949,7 +1967,7 @@ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); // Legalize VOP2 - if (isVOP2(*MI)) { + if (isVOP2(*MI) || isVOPC(*MI)) { legalizeOperandsVOP2(MRI, MI); return; } @@ -2520,6 +2538,14 @@ Inst->eraseFromParent(); continue; + case AMDGPU::S_CBRANCH_SCC0: + case AMDGPU::S_CBRANCH_SCC1: + // Clear unused bits of vcc + BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + break; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); @@ -2541,8 +2567,10 @@ // both. for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { MachineOperand &Op = Inst->getOperand(i); - if (Op.isReg() && Op.getReg() == AMDGPU::SCC) + if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { Inst->RemoveOperand(i); + addSCCDefUsersToVALUWorklist(Inst, Worklist); + } } if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { @@ -2575,19 +2603,24 @@ Inst->addOperand(MachineOperand::CreateImm(BitWidth)); } - // Update the destination register class. - const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); - if (!NewDstRC) - continue; + bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef(); + unsigned NewDstReg = AMDGPU::NoRegister; + if (HasDst) { + // Update the destination register class. + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); + if (!NewDstRC) + continue; - unsigned DstReg = Inst->getOperand(0).getReg(); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); + unsigned DstReg = Inst->getOperand(0).getReg(); + NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + } // Legalize the operands legalizeOperands(Inst); - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + if (HasDst) + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } } @@ -2862,6 +2895,22 @@ } } +void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst, + SmallVectorImpl &Worklist) const { + // This assumes that all the users of SCC are in the same block + // as the SCC def. + for (MachineBasicBlock::iterator I = SCCDefInst, + E = SCCDefInst->getParent()->end(); I != E; ++I) { + + // Exit if we find another SCC def. 
+ if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1) + return; + + if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1) + Worklist.push_back(I); + } +} + const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( const MachineInstr &Inst) const { const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -247,6 +247,30 @@ return cast(N)->getMemoryVT() == MVT::i16; }]>; +def si_setcc_uniform : PatFrag < + (ops node:$lhs, node:$rhs, node:$cond), + (setcc node:$lhs, node:$rhs, node:$cond), [{ + for (SDNode *Use : N->uses()) { + if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg) + return false; + + unsigned Reg = cast(Use->getOperand(1))->getReg(); + if (Reg != AMDGPU::SCC) + return false; + } + return true; +}]>; + +def si_uniform_br : PatFrag < + (ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{ + return isUniformBr(N); +}]>; + +def si_uniform_br_scc : PatFrag < + (ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{ + return isCBranchSCC(N); +}]>; + multiclass SIAtomicM0Glue2 { def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, @@ -824,7 +848,8 @@ class SOPC_Helper op, RegisterOperand rc, ValueType vt, string opName, PatLeaf cond> : SOPC < op, (outs), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []> { + opName#" $src0, $src1", + [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > { let Defs = [SCC]; } Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -336,18 +336,18 @@ // SOPC Instructions //===----------------------------------------------------------------------===// -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32", COND_EQ>; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32", COND_NE>; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32", COND_SGT>; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32", COND_SGE>; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32", COND_SLT>; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32", COND_SLE>; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32", COND_EQ>; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32", COND_NE >; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32", COND_UGT>; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32", COND_UGE>; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32", COND_ULT>; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32", COND_ULE>; ////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; ////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; 
////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; @@ -449,7 +449,8 @@ >; def S_CBRANCH_SCC1 : SOPP < 0x00000005, (ins sopp_brtarget:$simm16), - "s_cbranch_scc1 $simm16" + "s_cbranch_scc1 $simm16", + [(si_uniform_br_scc SCC, bb:$simm16)] >; } // End Uses = [SCC] @@ -2130,7 +2131,7 @@ def : Pat < (i64 (ctpop i64:$src)), (i64 (REG_SEQUENCE SReg_64, - (S_BCNT1_I32_B64 $src), sub0, + (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, (S_MOV_B32 0), sub1)) >; @@ -3030,10 +3031,12 @@ def : ZExt_i64_i1_Pat; def : ZExt_i64_i1_Pat; +// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that +// REG_SEQUENCE patterns don't support instructions with multiple outputs. def : Pat < (i64 (sext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, - (S_ASHR_I32 $src, 31), sub1) + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SGPR_32)), sub1) >; def : Pat < Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -420,7 +420,8 @@ &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, &AMDGPU::VReg_512RegClass, - &AMDGPU::SReg_512RegClass + &AMDGPU::SReg_512RegClass, + &AMDGPU::SCC_CLASSRegClass, }; for (const TargetRegisterClass *BaseClass : BaseClasses) { @@ -435,6 +436,8 @@ // TargetRegisterClass to mark which classes are VGPRs to make this trivial. bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { switch (RC->getSize()) { + case 0: return false; + case 1: return false; case 4: return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; case 8: Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -81,6 +81,11 @@ // Groupings using register classes and tuples //===----------------------------------------------------------------------===// +def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { + let CopyCost = -1; + let isAllocatable = 0; +} + // TODO: Do we need to set DwarfRegAlias on register tuples? 
// SGPR 32-bit registers Index: test/CodeGen/AMDGPU/and-gcn.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/and-gcn.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}v_and_i64_br: +; SI: v_and_b32 +; SI: v_and_b32 +define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { +entry: + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %if, label %endif + +if: + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %and = and i64 %a, %b + br label %endif + +endif: + %tmp1 = phi i64 [%and, %if], [0, %entry] + store i64 %tmp1, i64 addrspace(1)* %out, align 8 + ret void +} + +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 + +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/and.ll =================================================================== --- test/CodeGen/AMDGPU/and.ll +++ test/CodeGen/AMDGPU/and.ll @@ -244,26 +244,6 @@ ret void } -; FUNC-LABEL: {{^}}v_and_i64_br: -; SI: v_and_b32 -; SI: v_and_b32 -define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) { -entry: - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %if, label %endif - -if: - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %and = and i64 %a, %b - br label %endif - -endif: - %tmp1 = phi i64 [%and, %if], [0, %entry] - store i64 %tmp1, i64 addrspace(1)* %out, align 8 - ret void -} - ; FUNC-LABEL: {{^}}v_and_constant_i64: ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}} ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}} Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -15,11 +15,12 @@ ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: ; GCN: {{^}}BB0_2: -define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { +define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -44,11 +45,12 @@ ; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} ; GCN: {{^}}BB1_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -70,11 +72,12 @@ ; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} ; GCN: {{^}}BB2_2: ; GCN: 
s_or_b64 exec -define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -96,11 +99,12 @@ ; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} ; GCN: {{^}}BB3_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -127,14 +131,15 @@ ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} ; GCN: {{^}}BB4_2: -define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { +define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %add.arg = add i32 %arg, 8 %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -163,14 +168,15 @@ ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: {{^}}BB5_2: -define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { +define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %add.arg = add i32 %arg, 8 %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -196,12 +202,13 @@ ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GCN: {{^}}BB6_2: -define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { +define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { entry: %offset.ext = zext i32 %offset to i64 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -230,11 +237,12 @@ ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -258,11 +266,12 @@ ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -290,11 +299,12 @@ ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -321,11 +331,12 @@ ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}} ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -351,11 +362,12 @@ ; GCN: s_addc_u32 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -380,11 +392,12 @@ ; VI: s_load_dword s{{[0-9]+}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -417,11 +430,12 @@ ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) { +define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144 - %tmp0 = icmp eq i32 %cond, 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: @@ -436,3 +450,7 @@ done: ret void } + +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 + +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/endcf-loop-header.ll =================================================================== --- test/CodeGen/AMDGPU/endcf-loop-header.ll +++ test/CodeGen/AMDGPU/endcf-loop-header.ll @@ -12,8 +12,9 @@ ; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}} ; CHECK-NOT: s_or_b64 exec, exec ; CHECK: s_cbranch_execnz [[LOOP_LABEL]] -define void @test(i32 addrspace(1)* %out, i32 %cond) { +define void @test(i32 addrspace(1)* %out) { entry: + %cond = call i32 @llvm.r600.read.tidig.x() #0 %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %if, label %loop @@ -32,3 +33,7 @@ store i32 %inc, i32 addrspace(1)* %tmp3 ret void } + +declare i32 @llvm.r600.read.tidig.x() #0 + +attributes #0 = { readnone } Index: test/CodeGen/AMDGPU/i1-copy-implicit-def.ll =================================================================== --- test/CodeGen/AMDGPU/i1-copy-implicit-def.ll +++ test/CodeGen/AMDGPU/i1-copy-implicit-def.ll @@ -4,9 +4,8 @@ ; SILowerI1Copies was not handling IMPLICIT_DEF ; SI-LABEL: {{^}}br_implicit_def: ; SI: BB#0: -; SI-NEXT: s_and_saveexec_b64 -; SI-NEXT: s_xor_b64 -; SI-NEXT: BB#1: +; SI-NEXT: s_and_b64 vcc, exec +; SI-NEXT: s_cbranch_vccnz define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { bb: br i1 undef, label %bb1, label %bb2 Index: test/CodeGen/AMDGPU/i1-copy-phi.ll =================================================================== --- test/CodeGen/AMDGPU/i1-copy-phi.ll +++ test/CodeGen/AMDGPU/i1-copy-phi.ll @@ -10,9 +10,11 @@ ; SI: s_and_saveexec_b64 ; SI: s_xor_b64 ; SI: s_endpgm -define void @br_i1_phi(i32 %arg, i1 %arg1) #0 { +define void @br_i1_phi(i32 %arg) { bb: - br i1 %arg1, label %bb2, label %bb3 + %tidig = call i32 @llvm.r600.read.tidig.x() #0 + %cmp = trunc i32 %tidig to i1 + br i1 %cmp, label %bb2, label %bb3 bb2: ; preds = %bb br label %bb3 @@ -28,3 +30,7 @@ bb6: ; preds = %bb4, %bb3 ret void } + +declare i32 @llvm.r600.read.tidig.x() #0 + +attributes #0 = { readnone } Index: test/CodeGen/AMDGPU/inline-asm.ll =================================================================== --- test/CodeGen/AMDGPU/inline-asm.ll +++ test/CodeGen/AMDGPU/inline-asm.ll @@ -21,3 +21,21 @@ } attributes #0 = { 
"ShaderType"="0" } + + +; CHECK: {{^}}branch_on_asm: +; Make sure inline assembly is treted as divergent. +; CHECK: s_mov_b32 s{{[0-9]+}}, 0 +; CHECK: s_and_saveexec_b64 +define void @branch_on_asm(i32 addrspace(1)* %out) { + %zero = call i32 asm "s_mov_b32 $0, 0", "=s"() + %cmp = icmp eq i32 %zero, 0 + br i1 %cmp, label %if, label %endif + +if: + store i32 0, i32 addrspace(1)* %out + br label %endif + +endif: + ret void +} Index: test/CodeGen/AMDGPU/madmk.ll =================================================================== --- test/CodeGen/AMDGPU/madmk.ll +++ test/CodeGen/AMDGPU/madmk.ll @@ -193,7 +193,9 @@ bb2: ; preds = %bb6, %bb %tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ] - %tmp3 = fsub float undef, %tmp + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1 + %f_tid = bitcast i32 %tid to float + %tmp3 = fsub float %f_tid, %tmp %tmp5 = fcmp oeq float %tmp3, 1.000000e+04 br i1 %tmp5, label %bb1, label %bb6 @@ -203,3 +205,7 @@ %tmp8 = fadd float %tmp7, undef br label %bb2 } + +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 + +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll =================================================================== --- test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -10,8 +10,8 @@ ; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} ; GCN-NOT: v_mov_b32 -; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] -; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] +; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] +; GCN-NEXT: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] ; GCN-NOT: v_mov_b32 ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]] Index: test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- test/CodeGen/AMDGPU/salu-to-valu.ll +++ test/CodeGen/AMDGPU/salu-to-valu.ll @@ -431,5 +431,33 @@ ret void } +; Make sure we legalize vopc operands after moving an sopc to the value. 
+ +; GCN-LABEL: {{^}}sopc_vopc_legalize_bug: +; GCN: s_load_dword [[SGPR:s[0-9]+]] +; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}} +; GCN: s_and_b64 vcc, exec, vcc +; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]] +; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN-NOHSA: buffer_store_dword [[ONE]] +; GCN-HSA: flat_store_dword [[ONE]] +; GCN: {{^}}[[EXIT]]: +; GCN: s_endpgm +define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +bb3: ; preds = %bb2 + %tmp0 = bitcast i32 %cond to float + %tmp1 = fadd float %tmp0, 2.500000e-01 + %tmp2 = bitcast float %tmp1 to i32 + %tmp3 = icmp ult i32 %tmp2, %cond + br i1 %tmp3, label %bb6, label %bb7 + +bb6: + store i32 1, i32 addrspace(1)* %out + br label %bb7 + +bb7: ; preds = %bb3 + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/setcc.ll =================================================================== --- test/CodeGen/AMDGPU/setcc.ll +++ test/CodeGen/AMDGPU/setcc.ll @@ -379,7 +379,7 @@ ; Make sure we don't try to emit i1 setcc ops ; FUNC-LABEL: setcc-i1 ; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1 -; SI: v_cmp_eq_i32_e64 s[0:1], 0, [[AND]] +; SI: s_cmp_eq_i32 [[AND]], 0 define void @setcc-i1(i32 %in) { %and = and i32 %in, 1 %cmp = icmp eq i32 %and, 0 Index: test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- test/CodeGen/AMDGPU/si-annotate-cf.ll +++ test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -10,9 +10,10 @@ ; SI: s_andn2_b64 ; s_cbranch_execnz [[LOOP_LABEL]] ; SI: s_endpgm -define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) { main_body: - %0 = and i32 %a, %b + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %0 = and i32 %a, %tid %1 = trunc i32 %0 to i1 br label %ENDIF @@ -39,9 +40,10 @@ ; SI: s_cbranch_execnz [[LOOP_LABEL]] ; SI: s_endpgm -define void @phi_cond_outside_loop(i32 %a, i32 %b) { +define void @phi_cond_outside_loop(i32 %b) { entry: - %0 = icmp eq i32 %a , 0 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %0 = icmp eq i32 %tid , 0 br i1 %0, label %if, label %else if: @@ -61,3 +63,7 @@ exit: ret void } + +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 + +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/si-spill-cf.ll =================================================================== --- test/CodeGen/AMDGPU/si-spill-cf.ll +++ test/CodeGen/AMDGPU/si-spill-cf.ll @@ -80,7 +80,8 @@ LOOP: ; preds = %ENDIF2795, %main_body %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ] %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ] - %67 = icmp sgt i32 undef, 4 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %67 = icmp sgt i32 %tid, 4 br i1 %67, label %ENDLOOP, label %ENDIF ENDLOOP: ; preds = %ELSE2566, %LOOP @@ -228,13 +229,19 @@ %199 = fcmp olt float undef, %.temp292.9 %200 = and i1 %198, %199 %temp292.11 = select i1 %200, float undef, float %.temp292.9 - br i1 undef, label %IF2565, label %ELSE2566 + %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %cmp0 = icmp eq i32 %tid0, 0 + br i1 %cmp0, label %IF2565, label %ELSE2566 IF2565: ; preds = %ENDIF - br i1 false, label %ENDIF2582, label %ELSE2584 + %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %cmp1 = icmp eq i32 %tid1, 0 + br i1 %cmp1, label %ENDIF2582, label %ELSE2584 ELSE2566: ; preds = %ENDIF - %201 = fcmp oeq float %temp292.11, 
1.000000e+04 + %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %tidf = bitcast i32 %tid2 to float + %201 = fcmp oeq float %temp292.11, %tidf br i1 %201, label %ENDLOOP, label %ELSE2593 ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588 @@ -248,7 +255,9 @@ %207 = fcmp ogt float undef, 0.000000e+00 %208 = fcmp olt float undef, 1.000000e+00 %209 = and i1 %207, %208 - %210 = fcmp olt float undef, %206 + %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %tidf3 = bitcast i32 %tid3 to float + %210 = fcmp olt float %tidf3, %206 %211 = and i1 %209, %210 br i1 %211, label %ENDIF2795, label %ELSE2797 @@ -260,7 +269,9 @@ %213 = fadd float 0.000000e+00, %212 %floor = call float @llvm.floor.f32(float %213) %214 = fsub float %213, %floor - br i1 undef, label %IF2589, label %ELSE2590 + %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %cmp4 = icmp eq i32 %tid4, 0 + br i1 %cmp4, label %IF2589, label %ELSE2590 IF2589: ; preds = %ENDIF2582 br label %ENDIF2588 @@ -479,6 +490,8 @@ br label %ENDIF2795 } +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #2 + ; Function Attrs: nounwind readnone declare float @llvm.SI.load.const(<16 x i8>, i32) #2 Index: test/CodeGen/AMDGPU/smrd-vccz-bug.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOVCCZ-BUG %s + +; GCN-FUNC: {{^}}vccz_workaround: +; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0 +; GCN: v_cmp_neq_f32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0 +; GCN: s_and_b64 vcc, exec, [[MASK]] +; GCN: s_waitcnt lgkmcnt(0) +; VCCZ-BUG: s_mov_b64 vcc, vcc +; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc +; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]] +; GCN: buffer_store_dword +; GCN: [[EXIT]]: +; GCN: s_endpgm +define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { +entry: + %cnd = fcmp oeq float 0.0, %cond + %sgpr = load volatile i32, i32 addrspace(2)* %in + br i1 %cnd, label %if, label %endif + +if: + store i32 %sgpr, i32 addrspace(1)* %out + br label %endif + +endif: + ret void +} + +; GCN-FUNC: {{^}}vccz_noworkaround: +; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}} +; GCN: s_and_b64 vcc, exec, vcc +; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]] +; GCN: buffer_store_dword +; GCN: [[EXIT]]: +; GCN: s_endpgm +define void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %vgpr = load volatile float, float addrspace(1)* %in + %cnd = fcmp oeq float 0.0, %vgpr + br i1 %cnd, label %if, label %endif + +if: + store float %vgpr, float addrspace(1)* %out + br label %endif + +endif: + ret void +} Index: test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll =================================================================== --- test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -6,9 +6,11 @@ ; CHECK-LABEL: foobar: ; CHECK: s_load_dword s2, s[0:1], 0x9 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, 0, v1 +; CHECK-NEXT: 
s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; BB0_1: ; CHECK: s_load_dword s6, s[0:1], 0xa @@ -23,7 +25,9 @@ define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { entry: %v0 = insertelement <4 x float> undef, float %a0, i32 0 - br i1 undef, label %ift, label %ife + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %cnd = icmp eq i32 %tid, 0 + br i1 %cnd, label %ift, label %ife ift: %v1 = insertelement <4 x float> undef, float %a1, i32 0 @@ -35,3 +39,7 @@ store float %v2, float addrspace(1)* %out, align 4 ret void } + +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 + +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/uniform-cfg.ll @@ -0,0 +1,365 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; SI-LABEL: {{^}}uniform_if_scc: +; SI-DAG: s_cmp_eq_i32 s{{[0-9]+}}, 0 +; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; SI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]] + +; Fall-through to the else +; SI: v_mov_b32_e32 [[STORE_VAL]], 1 + +; SI: [[IF_LABEL]]: +; SI: buffer_store_dword [[STORE_VAL]] +define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) { +entry: + %cmp0 = icmp eq i32 %cond, 0 + br i1 %cmp0, label %if, label %else + +if: + br label %done + +else: + br label %done + +done: + %value = phi i32 [0, %if], [1, %else] + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}uniform_if_vcc: +; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and +; also scheduled the write first. +; SI: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} +; SI: s_and_b64 vcc, exec, [[COND]] +; SI: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] + +; Fall-through to the else +; SI: v_mov_b32_e32 [[STORE_VAL]], 1 + +; SI: [[IF_LABEL]]: +; SI: buffer_store_dword [[STORE_VAL]] +define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) { +entry: + %cmp0 = fcmp oeq float %cond, 0.0 + br i1 %cmp0, label %if, label %else + +if: + br label %done + +else: + br label %done + +done: + %value = phi i32 [0, %if], [1, %else] + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}uniform_if_swap_br_targets_scc: +; SI-DAG: s_cmp_lg_i32 s{{[0-9]+}}, 0 +; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; SI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]] + +; Fall-through to the else +; SI: v_mov_b32_e32 [[STORE_VAL]], 1 + +; SI: [[IF_LABEL]]: +; SI: buffer_store_dword [[STORE_VAL]] +define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) { +entry: + %cmp0 = icmp eq i32 %cond, 0 + br i1 %cmp0, label %else, label %if + +if: + br label %done + +else: + br label %done + +done: + %value = phi i32 [0, %if], [1, %else] + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}uniform_if_swap_br_targets_vcc: +; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and +; also scheduled the write first. 
+; SI: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} +; SI: s_and_b64 vcc, exec, [[COND]] +; SI: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] + +; Fall-through to the else +; SI: v_mov_b32_e32 [[STORE_VAL]], 1 + +; SI: [[IF_LABEL]]: +; SI: buffer_store_dword [[STORE_VAL]] +define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) { +entry: + %cmp0 = fcmp oeq float %cond, 0.0 + br i1 %cmp0, label %else, label %if + +if: + br label %done + +else: + br label %done + +done: + %value = phi i32 [0, %if], [1, %else] + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}uniform_if_move_valu: +; SI: v_add_f32_e32 [[CMP:v[0-9]+]] +; Using a floating-point value in an integer compare will cause the compare to +; be selected for the SALU and then later moved to the VALU. +; SI: v_cmp_ne_i32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]] +; SI: s_and_b64 vcc, exec, [[COND]] +; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: buffer_store_dword +; SI: [[ENDIF_LABEL]]: +; SI: s_endpgm +define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) { +entry: + %a.0 = fadd float %a, 10.0 + %cond = bitcast float %a.0 to i32 + %cmp = icmp eq i32 %cond, 5 + br i1 %cmp, label %if, label %endif + +if: + store i32 0, i32 addrspace(1)* %out + br label %endif + +endif: + ret void +} + +; SI-LABEL: {{^}}uniform_if_move_valu_commute: +; SI: v_add_f32_e32 [[CMP:v[0-9]+]] +; Using a floating-point value in an integer compare will cause the compare to +; be selected for the SALU and then later moved to the VALU. +; SI: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]] +; SI: s_and_b64 vcc, exec, [[COND]] +; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: buffer_store_dword +; SI: [[ENDIF_LABEL]]: +; SI: s_endpgm +define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) { +entry: + %a.0 = fadd float %a, 10.0 + %cond = bitcast float %a.0 to i32 + %cmp = icmp ugt i32 %cond, 5 + br i1 %cmp, label %if, label %endif + +if: + store i32 0, i32 addrspace(1)* %out + br label %endif + +endif: + ret void +} + + +; SI-LABEL: {{^}}uniform_if_else: +; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0 +; SI: s_cbranch_scc1 [[ELSE_LABEL:[0-9_A-Za-z]+]] +; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; SI: buffer_store_dword [[ONE]] +; SI: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: [[ELSE_LABEL]]: +; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; SI: buffer_store_dword [[TWO]] +; SI: [[ENDIF_LABEL]]: +; SI: s_endpgm +define void @uniform_if_else(i32 addrspace(1)* nocapture %out, i32 %a) { +entry: + %cmp = icmp eq i32 %a, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + store i32 1, i32 addrspace(1)* %out + br label %if.end + +if.else: ; preds = %entry + store i32 2, i32 addrspace(1)* %out + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +; SI-LABEL: {{^}}icmp_2_users: +; SI: s_cmp_lt_i32 s{{[0-9]+}}, 1 +; SI: s_cbranch_scc1 [[LABEL:[a-zA-Z0-9_]+]] +; SI: buffer_store_dword +; SI: [[LABEL]]: +; SI: s_endpgm +define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { +main_body: + %0 = icmp sgt i32 %cond, 0 + %1 = sext i1 %0 to i32 + br i1 %0, label %IF, label %ENDIF + +IF: + store i32 %1, i32 addrspace(1)* %out + br label %ENDIF + +ENDIF: ; preds = %IF, %main_body + ret void +} + +; SI-LABEL: {{^}}icmp_users_different_blocks: +; SI: s_load_dword [[COND:s[0-9]+]] +; SI: s_cmp_lt_i32 [[COND]], 1 +; SI: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] +; SI: 
v_cmp_lt_i32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0, [[COND]] +; SI: s_and_b64 vcc, exec, [[MASK]] +; SI: s_cbranch_vccnz [[EXIT]] +; SI: buffer_store +; SI: {{^}}[[EXIT]]: +; SI: s_endpgm +define void @icmp_users_different_blocks(i32 %cond, i32 addrspace(1)* %out) { +bb: + %tmp = tail call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = icmp sgt i32 %cond, 0 + br i1 %tmp1, label %bb2, label %bb9 + +bb2: ; preds = %bb + %tmp2 = sext i1 %tmp1 to i32 + %tmp3 = add i32 %tmp2, %tmp + br i1 %tmp1, label %bb9, label %bb7 + +bb7: ; preds = %bb5 + store i32 %tmp3, i32 addrspace(1)* %out + br label %bb9 + +bb9: ; preds = %bb8, %bb4 + ret void +} + +; SI-LABEL: {{^}}uniform_loop: +; SI: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]: +; FIXME: We need to teach SIFixSGPRCopies about uniform branches so we +; get s_add_i32 here. +; SI: v_add_i32_e32 [[I:v[0-9]+]], vcc, -1, v{{[0-9]+}} +; SI: v_cmp_ne_i32_e32 vcc, 0, [[I]] +; SI: s_and_b64 vcc, exec, vcc +; SI: s_cbranch_vccnz [[LOOP_LABEL]] +; SI: s_endpgm +define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) { +entry: + br label %loop + +loop: + %i = phi i32 [0, %entry], [%i.i, %loop] + %i.i = add i32 %i, 1 + %cmp = icmp eq i32 %a, %i.i + br i1 %cmp, label %done, label %loop + +done: + ret void +} + +; Test uniform and divergent. + +; SI-LABEL: {{^}}uniform_inside_divergent: +; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] +; SI: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: s_cmp_lg_i32 {{s[0-9]+}}, 0 +; SI: s_cbranch_scc1 [[ENDIF_LABEL]] +; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; SI: buffer_store_dword [[ONE]] +define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %endif + +if: + store i32 0, i32 addrspace(1)* %out + %u_cmp = icmp eq i32 %cond, 0 + br i1 %u_cmp, label %if_uniform, label %endif + +if_uniform: + store i32 1, i32 addrspace(1)* %out + br label %endif + +endif: + ret void +} + +; SI-LABEL: {{^}}divergent_inside_uniform: +; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0 +; SI: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] +; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; SI: buffer_store_dword [[ONE]] +; SI: [[ENDIF_LABEL]]: +; SI: s_endpgm +define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { +entry: + %u_cmp = icmp eq i32 %cond, 0 + br i1 %u_cmp, label %if, label %endif + +if: + store i32 0, i32 addrspace(1)* %out + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if_uniform, label %endif + +if_uniform: + store i32 1, i32 addrspace(1)* %out + br label %endif + +endif: + ret void +} + +; SI: {{^}}divergent_if_uniform_if: +; SI: v_cmp_eq_i32_e32 vcc, 0, v0 +; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] +; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; SI: buffer_store_dword [[ONE]] +; SI: s_or_b64 exec, exec, [[MASK]] +; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0 +; SI: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]] +; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; SI: buffer_store_dword [[TWO]] +; SI: [[EXIT]]: +; SI: s_endpgm +define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { +entry: + %tid = call i32 
@llvm.r600.read.tidig.x() #0 + %d_cmp = icmp eq i32 %tid, 0 + br i1 %d_cmp, label %if, label %endif + +if: + store i32 1, i32 addrspace(1)* %out + br label %endif + +endif: + %u_cmp = icmp eq i32 %cond, 0 + br i1 %u_cmp, label %if_uniform, label %exit + +if_uniform: + store i32 2, i32 addrspace(1)* %out + br label %exit + +exit: + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 + +attributes #0 = { readnone } Index: test/CodeGen/AMDGPU/uniform-crash.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/uniform-crash.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: {{^}}icmp_2_users: +; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 1 +; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]] +; GCN: [[LABEL]]: +; GCN-NEXT: s_endpgm +define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { +main_body: + %0 = icmp sgt i32 %cond, 0 + %1 = sext i1 %0 to i32 + br i1 %0, label %IF, label %ENDIF + +IF: + store i32 %1, i32 addrspace(1)* %out + br label %ENDIF + +ENDIF: ; preds = %IF, %main_body + ret void +} + +; GCN-LABEL: {{^}}fix_sgpr_live_ranges_crash: +; GCN: s_cbranch_scc1 [[BB0:[A-Z0-9_]+]] +; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]: +; GCN: s_cbranch_scc1 [[LOOP]] +; GCN: {{^}}[[BB0]]: +define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) { +bb: + %cnd = trunc i32 %arg to i1 + br i1 %cnd, label %bb2, label %bb5 + +bb2: ; preds = %bb + %tmp = mul i32 10, %arg1 + br label %bb3 + +bb3: ; preds = %bb3, %bb2 + %tmp4 = icmp eq i32 undef, %arg1 + br i1 %tmp4, label %bb5, label %bb3 + +bb5: ; preds = %bb3, %bb + %tmp6 = tail call i32 @llvm.r600.read.tidig.y() #1 + %tmp10 = icmp ult i32 %tmp6, %arg + br i1 %tmp10, label %bb11, label %bb12 + +bb11: ; preds = %bb11, %bb5 + br i1 undef, label %bb11, label %bb12 + +bb12: ; preds = %bb11, %bb5 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- test/CodeGen/AMDGPU/valu-i1.ll +++ test/CodeGen/AMDGPU/valu-i1.ll @@ -7,9 +7,10 @@ ; moved using VALU instructions ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1 ; SI: v_mov_b32_e32 v{{[0-9]}}, -1 -define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { +define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { entry: - switch i32 %a, label %default [ + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + switch i32 %tid, label %default [ i32 0, label %case0 i32 1, label %case1 ] @@ -25,7 +26,7 @@ br label %end default: - %cmp8 = icmp eq i32 %a, 2 + %cmp8 = icmp eq i32 %tid, 2 %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b br i1 %cmp8, label %if, label %else @@ -80,9 +81,11 @@ ; SI: buffer_load_dword ; SI-DAG: buffer_store_dword ; SI-DAG: v_cmp_eq_i32_e32 vcc, -; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]] -; SI: s_andn2_b64 exec, exec, [[OR_SREG]] -; SI: s_cbranch_execnz BB2_3 +; SI-DAG: s_and_b64 vcc, exec, vcc +; SI: s_cbranch_vccnz BB2_2 +; SI: s_branch BB2_3 +; SI: BB2_2: +; SI: s_endpgm define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { entry: Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll =================================================================== --- 
test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -175,7 +175,8 @@ %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ] %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ] %tmp142 = bitcast float %tmp95 to i32 - %tmp143 = icmp sgt i32 %tmp142, 125 + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %tmp143 = icmp sgt i32 %tmp142, %tid br i1 %tmp143, label %bb144, label %bb145 bb144: ; preds = %bb12 @@ -581,5 +582,7 @@ br label %bb12 } +declare i32 @llvm.r600.read.tidig.x() #1 + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -172,7 +172,8 @@ %tmp152 = phi float [ 0.000000e+00, %bb ], [ %tmp417, %bb157 ] %tmp153 = phi float [ 0.000000e+00, %bb ], [ %tmp418, %bb157 ] %tmp154 = bitcast float %tmp107 to i32 - %tmp155 = icmp sgt i32 %tmp154, 125 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1 + %tmp155 = icmp sgt i32 %tmp154, %tid br i1 %tmp155, label %bb156, label %bb157 bb156: ; preds = %bb24 @@ -487,6 +488,8 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 + attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" } attributes #1 = { nounwind readnone }