Index: include/llvm/CodeGen/SelectionDAG.h =================================================================== --- include/llvm/CodeGen/SelectionDAG.h +++ include/llvm/CodeGen/SelectionDAG.h @@ -351,12 +351,15 @@ SDUse *Ops = OperandRecycler.allocate( ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator); + bool IsDivergent = false; for (unsigned I = 0; I != Vals.size(); ++I) { Ops[I].setUser(Node); Ops[I].setInitial(Vals[I]); + IsDivergent = IsDivergent || Ops[I].getNode()->isDivergent(); } Node->NumOperands = Vals.size(); Node->OperandList = Ops; + Node->SDNodeBits.IsDivergent = IsDivergent; checkForCycles(Node); } @@ -463,6 +466,8 @@ return Root; } + void VerifyDAGDiverence(); + /// This iterates over the nodes in the SelectionDAG, folding /// certain types of nodes together, or eliminating superfluous nodes. The /// Level argument controls whether Combine is allowed to produce nodes and Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -466,11 +466,14 @@ friend class SDNode; friend class MemIntrinsicSDNode; friend class MemSDNode; + friend class SelectionDAG; + friend class SelectionDAGBuilder; uint16_t HasDebugValue : 1; uint16_t IsMemIntrinsic : 1; + uint16_t IsDivergent : 1; }; - enum { NumSDNodeBits = 2 }; + enum { NumSDNodeBits = 3 }; class ConstantSDNodeBitfields { friend class ConstantSDNode; @@ -548,6 +551,8 @@ // TODO: unfriend HandleSDNode once we fix its operand handling. friend class HandleSDNode; + friend class SelectionDAGBuilder; + /// Unique id per SDNode in the DAG. int NodeId = -1; @@ -662,6 +667,8 @@ bool getHasDebugValue() const { return SDNodeBits.HasDebugValue; } void setHasDebugValue(bool b) { SDNodeBits.HasDebugValue = b; } + bool isDivergent() const { return SDNodeBits.IsDivergent; } + /// Return true if there are no uses of this node. 
bool use_empty() const { return UseList == nullptr; } Index: include/llvm/CodeGen/TargetLowering.h =================================================================== --- include/llvm/CodeGen/TargetLowering.h +++ include/llvm/CodeGen/TargetLowering.h @@ -2551,6 +2551,10 @@ bool isPositionIndependent() const; + virtual bool isSDNodeSourceOfDivergence(const SDNode * N) const { + return false; + } + /// Returns true by value, base pointer and offset pointer and addressing mode /// by reference if the node's address can be legally represented as /// pre-indexed load / store address. Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7235,7 +7235,7 @@ ++UI; Use.set(To); } while (UI != UE && *UI == User); - + User->SDNodeBits.IsDivergent |= To->isDivergent(); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); @@ -7291,6 +7291,8 @@ Use.setNode(To); } while (UI != UE && *UI == User); + User->SDNodeBits.IsDivergent |= To->isDivergent(); + // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); @@ -7334,7 +7336,7 @@ ++UI; Use.set(ToOp); } while (UI != UE && *UI == User); - + User->SDNodeBits.IsDivergent |= To->getNode()->isDivergent(); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. 
AddModifiedNodeToCSEMaps(User); @@ -7345,6 +7347,26 @@ setRoot(SDValue(To[getRoot().getResNo()])); } +void SelectionDAG::VerifyDAGDiverence() +{ + const TargetLowering &TLI = getTargetLoweringInfo(); + bool Changed = true; + while (Changed) { + Changed = false; + for (auto &N : allnodes()) { + bool IsDivergent = N.isDivergent(); + bool IsSDNodeDivergent = TLI.isSDNodeSourceOfDivergence(&N); + for (auto &Op : N.ops()) { + IsSDNodeDivergent |= TLI.isSDNodeSourceOfDivergence(Op.getNode()); + } + if (!IsDivergent && IsSDNodeDivergent) { + N.SDNodeBits.IsDivergent = IsSDNodeDivergent; + Changed = true; + } + } + } +} + /// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The Deleted /// vector is handled the same way as for ReplaceAllUsesWith. @@ -7393,7 +7415,7 @@ ++UI; Use.set(To); } while (UI != UE && *UI == User); - + User->SDNodeBits.IsDivergent |= To->isDivergent(); // We are iterating over all uses of the From node, so if a use // doesn't use the specific value, no changes are made. if (!UserRemovedFromCSEMaps) Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -22,6 +22,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineValueType.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLowering.h" @@ -96,6 +97,8 @@ DenseMap<const Value*, SDValue> NodeMap; + DivergenceAnalysis * DA; + /// UnusedArgNodeMap - Maps argument value for unused arguments. This is used /// to preserve debug information for incoming arguments. 
DenseMap<const Value*, SDValue> UnusedArgNodeMap; @@ -627,7 +630,7 @@ : SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), DAG(dag), FuncInfo(funcinfo) {} - void init(GCFunctionInfo *gfi, AliasAnalysis *AA, + void init(GCFunctionInfo *gfi, AliasAnalysis *AA, DivergenceAnalysis *DA, const TargetLibraryInfo *li); /// Clear out the current SelectionDAG and the associated state and prepare @@ -682,6 +685,7 @@ SDValue getValueImpl(const Value *V); void setValue(const Value *V, SDValue NewN) { + NewN.getNode()->SDNodeBits.IsDivergent = DA ? DA->isDivergent(V) : 0; SDValue &N = NodeMap[V]; assert(!N.getNode() && "Already set a value for this node!"); N = NewN; Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -971,9 +971,11 @@ } void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, + DivergenceAnalysis *da, const TargetLibraryInfo *li) { AA = aa; GFI = gfi; + DA = da; LibInfo = li; DL = &DAG.getDataLayout(); Context = DAG.getContext(); Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -608,6 +608,8 @@ if (getNodeId() != -1) OS << " [ID=" << getNodeId() << ']'; + if (!(isa<ConstantSDNode>(this) || (isa<RegisterSDNode>(this)))) + OS << "# D:" << isDivergent(); if (!G) return; Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -432,7 +432,7 @@ else AA = nullptr; - SDB->init(GFI, AA, LibInfo); + SDB->init(GFI, AA, getAnalysisIfAvailable<DivergenceAnalysis>(), LibInfo); MF->setHasInlineAsm(false); @@ -860,6 +860,7 @@ if (ViewISelDAGs && MatchFilterBB) CurDAG->viewGraph("isel input for " + BlockName); + 
CurDAG->VerifyDAGDiverence(); // Third, instruction select all of the operations to machine code, adding the // code to the MachineBasicBlock. { Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -83,6 +84,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AMDGPUArgumentUsageInfo>(); + AU.addRequired<DivergenceAnalysis>(); SelectionDAGISel::getAnalysisUsage(AU); } Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -168,6 +168,8 @@ bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; + bool isSDNodeSourceOfDivergence(const SDNode * N) const; + static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -748,6 +748,95 @@ return true; } +bool AMDGPUTargetLowering::isSDNodeSourceOfDivergence(const SDNode * N) const +{ + switch (N->getOpcode()) { + case ISD::CopyFromReg: + { + if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(N->getOperand(1))) + { + unsigned Reg = R->getReg(); + const AMDGPURegisterInfo * RI = Subtarget->getRegisterInfo(); + if (RI->isPhysicalRegister(Reg)) { + return RI->getRegClass(AMDGPU::VGPR_32RegClassID)->contains(Reg) || + 
RI->getRegClass(AMDGPU::VReg_64RegClassID)->contains(Reg) || + RI->getRegClass(AMDGPU::VReg_96RegClassID)->contains(Reg) || + RI->getRegClass(AMDGPU::VReg_128RegClassID)->contains(Reg) || + RI->getRegClass(AMDGPU::VReg_256RegClassID)->contains(Reg) || + RI->getRegClass(AMDGPU::VReg_512RegClassID)->contains(Reg); + } + } + } + break; + case ISD::LOAD: + { + const LoadSDNode * L = dyn_cast<LoadSDNode>(N); + if (L->getMemOperand()->getAddrSpace() == Subtarget->getAMDGPUAS().PRIVATE_ADDRESS) + return true; + } + break; + case ISD::CALLSEQ_END: + return true; + break; + case ISD::INTRINSIC_WO_CHAIN: + { + unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntrID) { + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::r600_read_tidig_z: + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_interp_mov: + return true; + } + } + break; + case ISD::INTRINSIC_W_CHAIN: + { + unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntrID) { + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_image_atomic_swap: + case Intrinsic::amdgcn_image_atomic_add: + case Intrinsic::amdgcn_image_atomic_sub: + case Intrinsic::amdgcn_image_atomic_smin: + case Intrinsic::amdgcn_image_atomic_umin: + case Intrinsic::amdgcn_image_atomic_smax: + case Intrinsic::amdgcn_image_atomic_umax: + case Intrinsic::amdgcn_image_atomic_and: + case Intrinsic::amdgcn_image_atomic_or: + case Intrinsic::amdgcn_image_atomic_xor: + case Intrinsic::amdgcn_image_atomic_inc: + case Intrinsic::amdgcn_image_atomic_dec: + case Intrinsic::amdgcn_image_atomic_cmpswap: + case Intrinsic::amdgcn_buffer_atomic_swap: + case Intrinsic::amdgcn_buffer_atomic_add: + case 
Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_buffer_atomic_and: + case Intrinsic::amdgcn_buffer_atomic_or: + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_ps_live: + case Intrinsic::amdgcn_ds_swizzle: + return true; + } + } + break; + } + return false; +} + //===---------------------------------------------------------------------===// // Target Properties //===---------------------------------------------------------------------===// Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -5014,7 +5014,7 @@ unsigned NumElements = MemVT.getVectorNumElements(); if (AS == AMDGPUASI.CONSTANT_ADDRESS) { - if (isMemOpUniform(Load)) + if (!Op->isDivergent()) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // } if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { - if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && + if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they Index: lib/Target/AMDGPU/SMInstructions.td =================================================================== --- lib/Target/AMDGPU/SMInstructions.td +++ lib/Target/AMDGPU/SMInstructions.td @@ -223,11 +223,9 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ auto Ld = cast<LoadSDNode>(N); return Ld->getAlignment() >= 4 && - ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS 
&& - static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) || + ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && !N->isDivergent()) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && - !Ld->isVolatile() && - static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) && + !Ld->isVolatile() && !N->isDivergent() && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); }]>; Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -156,47 +156,39 @@ } ; GCN-LABEL: {{^}}restore_m0_lds: -; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] -; TOSMEM: s_cmp_eq_u32 + ; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it ; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill ; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x300 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill ; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_cbranch_scc1 +; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] ; TOSMEM: s_mov_b32 m0, -1 -; TOSMEM: s_mov_b32 s0, m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload -; TOSMEM: s_mov_b32 m0, s0 -; TOSMEM: s_waitcnt lgkmcnt(0) - ; TOSMEM: ds_write_b64 ; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x300 -; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload +; TOSMEM: s_add_u32 m0, s3, 0x100 +; TOSMEM: s_buffer_load_dword s2, s[88:91], m0 ; 4-byte Folded Reload ; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM-NOT: m0 -; TOSMEM: s_mov_b32 m0, s0 +; TOSMEM: s_mov_b32 m0, s2 ; TOSMEM: ; use m0 ; TOSMEM: s_dcache_wb ; TOSMEM: s_endpgm define amdgpu_kernel void @restore_m0_lds(i32 %arg) { %m0 = call i32 asm 
sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 - %sval = load volatile i64, i64 addrspace(2)* undef + %sval = load i64, i64 addrspace(2)* undef %cmp = icmp eq i32 %arg, 0 br i1 %cmp, label %ret, label %bb bb: - store volatile i64 %sval, i64 addrspace(3)* undef + store i64 %sval, i64 addrspace(3)* undef call void asm sideeffect "; use $0", "{M0}"(i32 %m0) #0 br label %ret