Index: include/llvm/Analysis/DivergenceAnalysis.h =================================================================== --- include/llvm/Analysis/DivergenceAnalysis.h +++ include/llvm/Analysis/DivergenceAnalysis.h @@ -13,6 +13,8 @@ // better decisions. // //===----------------------------------------------------------------------===// +#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H +#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H #include "llvm/ADT/DenseSet.h" #include "llvm/IR/Function.h" @@ -46,3 +48,5 @@ DenseSet DivergentValues; }; } // End llvm namespace + +#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H \ No newline at end of file Index: include/llvm/CodeGen/FunctionLoweringInfo.h =================================================================== --- include/llvm/CodeGen/FunctionLoweringInfo.h +++ include/llvm/CodeGen/FunctionLoweringInfo.h @@ -118,6 +118,17 @@ /// cross-basic-block values. DenseMap ValueMap; + /// VirtReg2Value map is needed by the Divergence Analysis driven + /// instruction selection. It is reverted ValueMap. It is computed + /// in lazy style - on demand. It is used to get the Value corresponding + /// to the live in virtual register and is called from the + /// TargetLowerinInfo::isSDNodeSourceOfDivergence. + DenseMap VirtReg2Value; + + /// This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence + /// to get the Value corresponding to the live-in virtual register. + const Value * getValueFromVirtualReg(unsigned Vreg); + /// Track virtual registers created for exception pointers. DenseMap CatchPadExceptionPointers; Index: include/llvm/CodeGen/SelectionDAG.h =================================================================== --- include/llvm/CodeGen/SelectionDAG.h +++ include/llvm/CodeGen/SelectionDAG.h @@ -28,8 +28,10 @@ #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineValueType.h" @@ -217,6 +219,9 @@ LLVMContext *Context; CodeGenOpt::Level OptLevel; + DivergenceAnalysis * DA = nullptr; + FunctionLoweringInfo * FLI = nullptr; + /// The function-level optimization remark emitter. Used to emit remarks /// whenever manipulating the DAG. OptimizationRemarkEmitter *ORE; @@ -346,19 +351,7 @@ .getRawSubclassData(); } - void createOperands(SDNode *Node, ArrayRef Vals) { - assert(!Node->OperandList && "Node already has operands"); - SDUse *Ops = OperandRecycler.allocate( - ArrayRecycler::Capacity::get(Vals.size()), OperandAllocator); - - for (unsigned I = 0; I != Vals.size(); ++I) { - Ops[I].setUser(Node); - Ops[I].setInitial(Vals[I]); - } - Node->NumOperands = Vals.size(); - Node->OperandList = Ops; - checkForCycles(Node); - } + void createOperands(SDNode *Node, ArrayRef Vals); void removeOperands(SDNode *Node) { if (!Node->OperandList) @@ -369,7 +362,7 @@ Node->NumOperands = 0; Node->OperandList = nullptr; } - + void CreateTopologicalOrder(std::vector& Order); public: explicit SelectionDAG(const TargetMachine &TM, CodeGenOpt::Level); SelectionDAG(const SelectionDAG &) = delete; @@ -378,7 +371,12 @@ /// Prepare this SelectionDAG to process code in the given MachineFunction. void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, - Pass *PassPtr, const TargetLibraryInfo *LibraryInfo); + Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, + DivergenceAnalysis * DA); + + void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) { + FLI = FuncInfo; + } /// Clear state and free memory necessary to make this /// SelectionDAG ready to process a new block. @@ -463,6 +461,8 @@ return Root; } + void VerifyDAGDiverence(); + /// This iterates over the nodes in the SelectionDAG, folding /// certain types of nodes together, or eliminating superfluous nodes. The /// Level argument controls whether Combine is allowed to produce nodes and @@ -1128,6 +1128,9 @@ SDValue Op3, SDValue Op4, SDValue Op5); SDNode *UpdateNodeOperands(SDNode *N, ArrayRef Ops); + // Propagates the change in divergence to users + void updateDivergence(SDNode * N); + /// These are used for target selectors to *mutate* the /// specified node to have the specified return type, Target opcode, and /// operands. Note that target opcodes are stored as Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -466,11 +466,13 @@ friend class SDNode; friend class MemIntrinsicSDNode; friend class MemSDNode; + friend class SelectionDAG; uint16_t HasDebugValue : 1; uint16_t IsMemIntrinsic : 1; + uint16_t IsDivergent : 1; }; - enum { NumSDNodeBits = 2 }; + enum { NumSDNodeBits = 3 }; class ConstantSDNodeBitfields { friend class ConstantSDNode; @@ -662,6 +664,8 @@ bool getHasDebugValue() const { return SDNodeBits.HasDebugValue; } void setHasDebugValue(bool b) { SDNodeBits.HasDebugValue = b; } + bool isDivergent() const { return SDNodeBits.IsDivergent; } + /// Return true if there are no uses of this node. bool use_empty() const { return UseList == nullptr; } Index: include/llvm/CodeGen/TargetLowering.h =================================================================== --- include/llvm/CodeGen/TargetLowering.h +++ include/llvm/CodeGen/TargetLowering.h @@ -29,6 +29,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineValueType.h" @@ -2562,6 +2563,16 @@ bool isPositionIndependent() const; + virtual bool isSDNodeSourceOfDivergence(const SDNode *N, + FunctionLoweringInfo *FLI, + DivergenceAnalysis *DA) const { + return false; + } + + virtual bool isSDNodeAlwaysUniform(const SDNode * N) const { + return false; + } + /// Returns true by value, base pointer and offset pointer and addressing mode /// by reference if the node's address can be legally represented as /// pre-indexed load / store address. Index: lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp =================================================================== --- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -547,3 +547,13 @@ } return std::make_pair(It->second, false); } + +const Value * +FunctionLoweringInfo::getValueFromVirtualReg(unsigned Vreg) { + if (VirtReg2Value.empty()) { + for (auto &P : ValueMap) { + VirtReg2Value[P.second] = P.first; + } + } + return VirtReg2Value[Vreg]; +} Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -950,7 +950,8 @@ void SelectionDAG::init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, - Pass *PassPtr, const TargetLibraryInfo *LibraryInfo) { + Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, + DivergenceAnalysis * Divergence) { MF = &NewMF; SDAGISelPass = PassPtr; ORE = &NewORE; @@ -958,6 +959,7 @@ TSI = getSubtarget().getSelectionDAGInfo(); LibInfo = LibraryInfo; Context = &MF->getFunction().getContext(); + DA = Divergence; } SelectionDAG::~SelectionDAG() { @@ -1713,6 +1715,7 @@ return SDValue(E, 0); auto *N = newSDNode(RegNo, VT); + N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, DA); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); @@ -6699,6 +6702,7 @@ if (N->OperandList[1] != Op2) N->OperandList[1].set(Op2); + updateDivergence(N); // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); return N; @@ -7340,8 +7344,9 @@ SDUse &Use = UI.getUse(); ++UI; Use.set(To); + if (To->isDivergent() != From->isDivergent()) + updateDivergence(User); } while (UI != UE && *UI == User); - // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); @@ -7395,6 +7400,8 @@ SDUse &Use = UI.getUse(); ++UI; Use.setNode(To); + if (To->isDivergent() != From->isDivergent()) + updateDivergence(User); } while (UI != UE && *UI == User); // Now that we have modified User, add it back to the CSE maps. If it @@ -7439,8 +7446,9 @@ const SDValue &ToOp = To[Use.getResNo()]; ++UI; Use.set(ToOp); + if (To->getNode()->isDivergent() != From->isDivergent()) + updateDivergence(User); } while (UI != UE && *UI == User); - // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); @@ -7498,8 +7506,9 @@ ++UI; Use.set(To); + if (To->isDivergent() != From->isDivergent()) + updateDivergence(User); } while (UI != UE && *UI == User); - // We are iterating over all uses of the From node, so if a use // doesn't use the specific value, no changes are made. if (!UserRemovedFromCSEMaps) @@ -7532,6 +7541,70 @@ } // end anonymous namespace +void SelectionDAG::updateDivergence(SDNode * N) +{ + if (TLI->isSDNodeAlwaysUniform(N)) + return; + bool IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, DA); + for (auto &Op : N->ops()) { + if (Op.Val.getValueType() != MVT::Other) + IsDivergent |= Op.getNode()->isDivergent(); + } + if (N->SDNodeBits.IsDivergent != IsDivergent) { + N->SDNodeBits.IsDivergent = IsDivergent; + for (auto U : N->uses()) { + updateDivergence(U); + } + } +} + + +void SelectionDAG::CreateTopologicalOrder(std::vector& Order) { + DenseMap Degree; + Order.reserve(AllNodes.size()); + for (auto & N : allnodes()) { + unsigned NOps = N.getNumOperands(); + Degree[&N] = NOps; + if (0 == NOps) + Order.push_back(&N); + } + for (std::vector::iterator I = Order.begin(); + I!=Order.end();++I) { + SDNode * N = *I; + for (auto U : N->uses()) { + unsigned &UnsortedOps = Degree[U]; + if (0 == --UnsortedOps) + Order.push_back(U); + } + } +} + +void SelectionDAG::VerifyDAGDiverence() +{ + std::vector TopoOrder; + CreateTopologicalOrder(TopoOrder); + const TargetLowering &TLI = getTargetLoweringInfo(); + DenseMap DivergenceMap; + for (auto &N : allnodes()) { + DivergenceMap[&N] = false; + } + for (auto N : TopoOrder) { + bool IsDivergent = DivergenceMap[N]; + bool IsSDNodeDivergent = TLI.isSDNodeSourceOfDivergence(N, FLI, DA); + for (auto &Op : N->ops()) { + if (Op.Val.getValueType() != MVT::Other) + IsSDNodeDivergent |= DivergenceMap[Op.getNode()]; + } + if (!IsDivergent && IsSDNodeDivergent && !TLI.isSDNodeAlwaysUniform(N)) { + DivergenceMap[N] = true; + } + } + for (auto &N : allnodes()) { + assert(DivergenceMap[&N] == N.isDivergent() && "Divergence bit inconsistency detected\n"); + } +} + + /// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The same value /// may appear in both the From and To list. The Deleted vector is @@ -8337,6 +8410,26 @@ return nullptr; } +void SelectionDAG::createOperands(SDNode *Node, ArrayRef Vals) { + assert(!Node->OperandList && "Node already has operands"); + SDUse *Ops = OperandRecycler.allocate( + ArrayRecycler::Capacity::get(Vals.size()), OperandAllocator); + + bool IsDivergent = false; + for (unsigned I = 0; I != Vals.size(); ++I) { + Ops[I].setUser(Node); + Ops[I].setInitial(Vals[I]); + if (Ops[I].Val.getValueType() != MVT::Other) // Skip Chain. It does not carry divergence. + IsDivergent = IsDivergent || Ops[I].getNode()->isDivergent(); + } + Node->NumOperands = Vals.size(); + Node->OperandList = Ops; + IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, DA); + if (!TLI->isSDNodeAlwaysUniform(Node)) + Node->SDNodeBits.IsDivergent = IsDivergent; + checkForCycles(Node); +} + #ifndef NDEBUG static void checkForCyclesHelper(const SDNode *N, SmallPtrSetImpl &Visited, Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -629,6 +629,8 @@ if (getNodeId() != -1) OS << " [ID=" << getNodeId() << ']'; + if (!(isa(this) || (isa(this)))) + OS << "# D:" << isDivergent(); if (!G) return; Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" @@ -329,6 +330,7 @@ AU.addPreserved(); AU.addPreserved(); AU.addRequired(); + AU.addRequired(); if (UseMBPI && OptLevel != CodeGenOpt::None) AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -414,7 +416,8 @@ SplitCriticalSideEffectEdges(const_cast(Fn), DT, LI); - CurDAG->init(*MF, *ORE, this, LibInfo); + CurDAG->init(*MF, *ORE, this, LibInfo, + getAnalysisIfAvailable()); FuncInfo->set(Fn, *MF, CurDAG); // Now get the optional analyzes if we want to. @@ -711,6 +714,8 @@ int BlockNumber = -1; (void)BlockNumber; bool MatchFilterBB = false; (void)MatchFilterBB; + TargetTransformInfo &TTI = + getAnalysis().getTTI(*FuncInfo->Fn); // Pre-type legalization allow creation of any node types. CurDAG->NewNodesMustHaveLegalTypes = false; @@ -744,6 +749,9 @@ CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel); } + if (TTI.hasBranchDivergence()) + CurDAG->VerifyDAGDiverence(); + DEBUG(dbgs() << "Optimized lowered selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; @@ -761,6 +769,9 @@ Changed = CurDAG->LegalizeTypes(); } + if (TTI.hasBranchDivergence()) + CurDAG->VerifyDAGDiverence(); + DEBUG(dbgs() << "Type-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; @@ -780,6 +791,9 @@ CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel); } + if (TTI.hasBranchDivergence()) + CurDAG->VerifyDAGDiverence(); + DEBUG(dbgs() << "Optimized type-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; @@ -823,6 +837,9 @@ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); + + if (TTI.hasBranchDivergence()) + CurDAG->VerifyDAGDiverence(); } if (ViewLegalizeDAGs && MatchFilterBB) @@ -834,6 +851,9 @@ CurDAG->Legalize(); } + if (TTI.hasBranchDivergence()) + CurDAG->VerifyDAGDiverence(); + DEBUG(dbgs() << "Legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; @@ -849,6 +869,9 @@ CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel); } + if (TTI.hasBranchDivergence()) + CurDAG->VerifyDAGDiverence(); + DEBUG(dbgs() << "Optimized legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; @@ -1401,6 +1424,8 @@ FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()]; FuncInfo->InsertPt = FuncInfo->MBB->begin(); + CurDAG->setFunctionLoweringInfo(FuncInfo); + if (!FastIS) { LowerArguments(Fn); } else { Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -83,6 +84,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); SelectionDAGISel::getAnalysisUsage(AU); } Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -168,6 +168,9 @@ bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; + bool isSDNodeSourceOfDivergence(const SDNode * N, + FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const; + bool isSDNodeAlwaysUniform(const SDNode * N) const; static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -25,6 +25,7 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "Utils/AMDGPUBaseInfo.h" #include "R600MachineFunctionInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -748,6 +749,101 @@ return true; } +bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const { + switch (N->getOpcode()) { + default: + return false; + case ISD::EntryToken: + case ISD::TokenFactor: + return true; + case ISD::INTRINSIC_WO_CHAIN: + { + unsigned IntrID = cast(N->getOperand(0))->getZExtValue(); + switch (IntrID) { + default: + return false; + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + return true; + } + } + break; + case ISD::LOAD: + { + const LoadSDNode * L = dyn_cast(N); + if (L->getMemOperand()->getAddrSpace() + == Subtarget->getAMDGPUAS().CONSTANT_ADDRESS_32BIT) + return true; + return false; + } + break; + } +} + +bool AMDGPUTargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, + FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const +{ + switch (N->getOpcode()) { + case ISD::Register: + case ISD::CopyFromReg: + { + const RegisterSDNode *R = nullptr; + if (N->getOpcode() == ISD::Register) { + R = dyn_cast(N); + } + else { + R = dyn_cast(N->getOperand(1)); + } + if (R) + { + const MachineFunction * MF = FLI->MF; + const SISubtarget &ST = MF->getSubtarget(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); + unsigned Reg = R->getReg(); + if (TRI.isPhysicalRegister(Reg)) + return TRI.isVGPR(MRI, Reg); + + if (MRI.isLiveIn(Reg)) { + // workitem.id.x workitem.id.y workitem.id.z + if ((MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_X) || + (MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Y) || + (MRI.getLiveInPhysReg(Reg) == AMDGPU::T0_Z)|| + (MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR0) || + (MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR1) || + (MRI.getLiveInPhysReg(Reg) == AMDGPU::VGPR2)) + return true; + // Formal arguments of non-entry functions + // are conservatively considered divergent + else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) + return true; + } + return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg)); + } + } + break; + case ISD::LOAD: { + const LoadSDNode *L = dyn_cast(N); + if (L->getMemOperand()->getAddrSpace() == + Subtarget->getAMDGPUAS().PRIVATE_ADDRESS) + return true; + } break; + case ISD::CALLSEQ_END: + return true; + break; + case ISD::INTRINSIC_WO_CHAIN: + { + + } + return AMDGPU::isIntrinsicSourceOfDivergence( + cast(N->getOperand(0))->getZExtValue()); + case ISD::INTRINSIC_W_CHAIN: + return AMDGPU::isIntrinsicSourceOfDivergence( + cast(N->getOperand(1))->getZExtValue()); + } + return false; +} + //===---------------------------------------------------------------------===// // Target Properties //===---------------------------------------------------------------------===// Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -17,6 +17,7 @@ #include "AMDGPUTargetTransformInfo.h" #include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -464,55 +465,7 @@ } } -static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) { - switch (I->getIntrinsicID()) { - case Intrinsic::amdgcn_workitem_id_x: - case Intrinsic::amdgcn_workitem_id_y: - case Intrinsic::amdgcn_workitem_id_z: - case Intrinsic::amdgcn_interp_mov: - case Intrinsic::amdgcn_interp_p1: - case Intrinsic::amdgcn_interp_p2: - case Intrinsic::amdgcn_mbcnt_hi: - case Intrinsic::amdgcn_mbcnt_lo: - case Intrinsic::r600_read_tidig_x: - case Intrinsic::r600_read_tidig_y: - case Intrinsic::r600_read_tidig_z: - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: - case Intrinsic::amdgcn_image_atomic_swap: - case Intrinsic::amdgcn_image_atomic_add: - case Intrinsic::amdgcn_image_atomic_sub: - case Intrinsic::amdgcn_image_atomic_smin: - case Intrinsic::amdgcn_image_atomic_umin: - case Intrinsic::amdgcn_image_atomic_smax: - case Intrinsic::amdgcn_image_atomic_umax: - case Intrinsic::amdgcn_image_atomic_and: - case Intrinsic::amdgcn_image_atomic_or: - case Intrinsic::amdgcn_image_atomic_xor: - case Intrinsic::amdgcn_image_atomic_inc: - case Intrinsic::amdgcn_image_atomic_dec: - case Intrinsic::amdgcn_image_atomic_cmpswap: - case Intrinsic::amdgcn_buffer_atomic_swap: - case Intrinsic::amdgcn_buffer_atomic_add: - case Intrinsic::amdgcn_buffer_atomic_sub: - case Intrinsic::amdgcn_buffer_atomic_smin: - case Intrinsic::amdgcn_buffer_atomic_umin: - case Intrinsic::amdgcn_buffer_atomic_smax: - case Intrinsic::amdgcn_buffer_atomic_umax: - case Intrinsic::amdgcn_buffer_atomic_and: - case Intrinsic::amdgcn_buffer_atomic_or: - case Intrinsic::amdgcn_buffer_atomic_xor: - case Intrinsic::amdgcn_buffer_atomic_cmpswap: - case Intrinsic::amdgcn_ps_live: - case Intrinsic::amdgcn_ds_swizzle: - return true; - default: - return false; - } -} + static bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); @@ -563,7 +516,7 @@ return true; if (const IntrinsicInst *Intrinsic = dyn_cast(V)) - return isIntrinsicSourceOfDivergence(Intrinsic); + return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID()); // Assume all function calls are a source of divergence. if (isa(V) || isa(V)) Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -5372,7 +5372,7 @@ unsigned NumElements = MemVT.getVectorNumElements(); if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { - if (isMemOpUniform(Load)) + if (!Op->isDivergent()) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private @@ -5382,7 +5382,7 @@ if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || AS == AMDGPUASI.GLOBAL_ADDRESS) { - if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && + if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they Index: lib/Target/AMDGPU/SMInstructions.td =================================================================== --- lib/Target/AMDGPU/SMInstructions.td +++ lib/Target/AMDGPU/SMInstructions.td @@ -223,12 +223,9 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ auto Ld = cast(N); return Ld->getAlignment() >= 4 && - (((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && - static_cast(getTargetLowering())->isMemOpUniform(N)) || + ((((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) || (Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT)) && !N->isDivergent()) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && - !Ld->isVolatile() && - static_cast(getTargetLowering())->isMemOpUniform(N) && + !Ld->isVolatile() && !N->isDivergent() && static_cast(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); }]>; Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -382,6 +382,9 @@ /// not the encoded offset. bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); +/// \returns true if the intrinsic is divergent +bool isIntrinsicSourceOfDivergence(unsigned IntrID); + } // end namespace AMDGPU } // end namespace llvm Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUBaseInfo.h" +#include "AMDGPUTargetTransformInfo.h" #include "AMDGPU.h" #include "SIDefines.h" #include "llvm/ADT/StringRef.h" @@ -938,5 +939,55 @@ AMDGPUAS getAMDGPUAS(const Module &M) { return getAMDGPUAS(Triple(M.getTargetTriple())); } + +bool isIntrinsicSourceOfDivergence(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::amdgcn_interp_mov: + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::r600_read_tidig_z: + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: + case Intrinsic::amdgcn_image_atomic_swap: + case Intrinsic::amdgcn_image_atomic_add: + case Intrinsic::amdgcn_image_atomic_sub: + case Intrinsic::amdgcn_image_atomic_smin: + case Intrinsic::amdgcn_image_atomic_umin: + case Intrinsic::amdgcn_image_atomic_smax: + case Intrinsic::amdgcn_image_atomic_umax: + case Intrinsic::amdgcn_image_atomic_and: + case Intrinsic::amdgcn_image_atomic_or: + case Intrinsic::amdgcn_image_atomic_xor: + case Intrinsic::amdgcn_image_atomic_inc: + case Intrinsic::amdgcn_image_atomic_dec: + case Intrinsic::amdgcn_image_atomic_cmpswap: + case Intrinsic::amdgcn_buffer_atomic_swap: + case Intrinsic::amdgcn_buffer_atomic_add: + case Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_buffer_atomic_and: + case Intrinsic::amdgcn_buffer_atomic_or: + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_ps_live: + case Intrinsic::amdgcn_ds_swizzle: + return true; + default: + return false; + } +} } // namespace AMDGPU } // namespace llvm Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -2,7 +2,9 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}use_dispatch_ptr: -; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define void @use_dispatch_ptr() #1 { %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* @@ -19,7 +21,9 @@ } ; GCN-LABEL: {{^}}use_queue_ptr: -; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define void @use_queue_ptr() #1 { %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* @@ -37,11 +41,12 @@ } ; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast: -; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10 +; CIVI: flat_load_dword v[[HI:[0-9]+]], v[0:1] ; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]] - -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}} +; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16 +; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] +; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}} +; CIVI: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}} define void @use_queue_ptr_addrspacecast() #1 { %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32* store volatile i32 0, i32* %asc @@ -60,7 +65,9 @@ } ; GCN-LABEL: {{^}}use_kernarg_segment_ptr: -; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define void @use_kernarg_segment_ptr() #1 { %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 %header_ptr = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* @@ -424,9 +431,15 @@ ; GCN-LABEL: {{^}}use_every_sgpr_input: ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 -; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 -; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; GCN: s_load_dword s{{[0-9]+}}, s[10:11], 0x0 +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s10 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s11 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} ; GCN: ; use s[12:13] ; GCN: ; use s14 ; GCN: ; use s15 @@ -557,12 +570,23 @@ ; GCN-DAG: s_mov_b32 s6, s14 ; GCN-DAG: s_mov_b32 s7, s15 ; GCN-DAG: s_mov_b32 s8, s16 + +; GCN-DAG: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[6:7] +; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[8:9] +; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[10:11] + ; GCN: s_swappc_b64 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 -; GCN: s_load_dword s{{[0-9]+}}, -; GCN: s_load_dword s{{[0-9]+}}, -; GCN: s_load_dword s{{[0-9]+}}, +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_X]] +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_X]] +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Y]] +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Y]] +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Z]] +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Z]] +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} ; GCN: ; use ; GCN: ; use [[SAVE_X]] ; GCN: ; use [[SAVE_Y]] Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -34,7 +34,13 @@ ; GCN-LABEL: {{^}}func_implicitarg_ptr: ; GCN: s_waitcnt -; GCN-NEXT: s_load_dword s{{[0-9]+}}, s[6:7], 0x0{{$}} +; MESA: s_mov_b64 s[8:9], s[6:7] +; MESA: s_mov_b32 s11, 0xf000 +; MESA: s_mov_b32 s10, -1 +; MESA: buffer_load_dword v0, off, s[8:11], 0 +; HSA: v_mov_b32_e32 v0, s6 +; HSA: v_mov_b32_e32 v1, s7 +; HSA: flat_load_dword v0, v[0:1] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @func_implicitarg_ptr() #1 { @@ -83,8 +89,21 @@ ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr: ; GCN: s_waitcnt -; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0{{$}} -; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0{{$}} +; MESA: s_mov_b64 s[12:13], s[6:7] +; MESA: s_mov_b32 s15, 0xf000 +; MESA: s_mov_b32 s14, -1 +; MESA: buffer_load_dword v0, off, s[12:15], 0 +; HSA: v_mov_b32_e32 v0, s6 +; HSA: v_mov_b32_e32 v1, s7 +; HSA: flat_load_dword v0, v[0:1] +; MESA: s_mov_b32 s10, s14 +; MESA: s_mov_b32 s11, s15 +; MESA: buffer_load_dword v0, off, s[8:11], 0 +; HSA: v_mov_b32_e32 v0, s8 +; HSA: v_mov_b32_e32 v1, s9 +; HSA: flat_load_dword v0, v[0:1] + +; GCN: s_waitcnt vmcnt(0) define void @func_kernarg_implicitarg_ptr() #1 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()