Index: include/llvm/CodeGen/SelectionDAG.h =================================================================== --- include/llvm/CodeGen/SelectionDAG.h +++ include/llvm/CodeGen/SelectionDAG.h @@ -349,12 +349,15 @@ SDUse *Ops = OperandRecycler.allocate( ArrayRecycler::Capacity::get(Vals.size()), OperandAllocator); + bool IsDivergent = false; for (unsigned I = 0; I != Vals.size(); ++I) { Ops[I].setUser(Node); Ops[I].setInitial(Vals[I]); + IsDivergent = IsDivergent || Ops[I].getNode()->isDivergent(); } Node->NumOperands = Vals.size(); Node->OperandList = Ops; + Node->SDNodeBits.IsDivergent = IsDivergent; checkForCycles(Node); } Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -466,11 +466,14 @@ friend class SDNode; friend class MemIntrinsicSDNode; friend class MemSDNode; + friend class SelectionDAG; + friend class SelectionDAGBuilder; uint16_t HasDebugValue : 1; uint16_t IsMemIntrinsic : 1; + uint16_t IsDivergent : 1; }; - enum { NumSDNodeBits = 2 }; + enum { NumSDNodeBits = 3 }; class ConstantSDNodeBitfields { friend class ConstantSDNode; @@ -548,6 +551,8 @@ // TODO: unfriend HandleSDNode once we fix its operand handling. friend class HandleSDNode; + friend class SelectionDAGBuilder; + /// Unique id per SDNode in the DAG. int NodeId = -1; @@ -662,6 +667,8 @@ bool getHasDebugValue() const { return SDNodeBits.HasDebugValue; } void setHasDebugValue(bool b) { SDNodeBits.HasDebugValue = b; } + bool isDivergent() const { return SDNodeBits.IsDivergent; } + /// Return true if there are no uses of this node. bool use_empty() const { return UseList == nullptr; } Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7207,7 +7207,7 @@ ++UI; Use.set(To); } while (UI != UE && *UI == User); - + User->SDNodeBits.IsDivergent |= To->isDivergent(); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); @@ -7263,6 +7263,8 @@ Use.setNode(To); } while (UI != UE && *UI == User); + User->SDNodeBits.IsDivergent |= To->isDivergent(); + // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); @@ -7306,7 +7308,7 @@ ++UI; Use.set(ToOp); } while (UI != UE && *UI == User); - + User->SDNodeBits.IsDivergent |= To->getNode()->isDivergent(); // Now that we have modified User, add it back to the CSE maps. If it // already exists there, recursively merge the results together. AddModifiedNodeToCSEMaps(User); @@ -7365,7 +7367,7 @@ ++UI; Use.set(To); } while (UI != UE && *UI == User); - + User->SDNodeBits.IsDivergent |= To->isDivergent(); // We are iterating over all uses of the From node, so if a use // doesn't use the specific value, no changes are made. if (!UserRemovedFromCSEMaps) Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -22,6 +22,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineValueType.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLowering.h" @@ -96,6 +97,8 @@ DenseMap NodeMap; + DivergenceAnalysis * DA; + /// UnusedArgNodeMap - Maps argument value for unused arguments. This is used /// to preserve debug information for incoming arguments. DenseMap UnusedArgNodeMap; @@ -627,7 +630,7 @@ : SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), DAG(dag), FuncInfo(funcinfo) {} - void init(GCFunctionInfo *gfi, AliasAnalysis *AA, + void init(GCFunctionInfo *gfi, AliasAnalysis *AA, DivergenceAnalysis *DA, const TargetLibraryInfo *li); /// Clear out the current SelectionDAG and the associated state and prepare @@ -682,6 +685,7 @@ SDValue getValueImpl(const Value *V); void setValue(const Value *V, SDValue NewN) { + NewN.getNode()->SDNodeBits.IsDivergent = DA ? DA->isDivergent(V) : 0; SDValue &N = NodeMap[V]; assert(!N.getNode() && "Already set a value for this node!"); N = NewN; Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -971,9 +971,11 @@ } void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, + DivergenceAnalysis *da, const TargetLibraryInfo *li) { AA = aa; GFI = gfi; + DA = da; LibInfo = li; DL = &DAG.getDataLayout(); Context = DAG.getContext(); Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -608,6 +608,8 @@ if (getNodeId() != -1) OS << " [ID=" << getNodeId() << ']'; + if (!(isa(this) || (isa(this)))) + OS << "# D:" << isDivergent(); if (!G) return; Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -432,7 +432,7 @@ else AA = nullptr; - SDB->init(GFI, AA, LibInfo); + SDB->init(GFI, AA, getAnalysisIfAvailable(), LibInfo); MF->setHasInlineAsm(false); Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -83,6 +84,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); SelectionDAGISel::getAnalysisUsage(AU); } Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -4716,7 +4716,7 @@ unsigned NumElements = MemVT.getVectorNumElements(); if (AS == AMDGPUASI.CONSTANT_ADDRESS) { - if (isMemOpUniform(Load)) + if (!Op->isDivergent()) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private @@ -4724,7 +4724,7 @@ // } if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { - if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && + if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they Index: lib/Target/AMDGPU/SMInstructions.td =================================================================== --- lib/Target/AMDGPU/SMInstructions.td +++ lib/Target/AMDGPU/SMInstructions.td @@ -223,11 +223,9 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ auto Ld = cast(N); return Ld->getAlignment() >= 4 && - ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && - static_cast(getTargetLowering())->isMemOpUniform(N)) || + ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && !N->isDivergent()) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && - !Ld->isVolatile() && - static_cast(getTargetLowering())->isMemOpUniform(N) && + !Ld->isVolatile() && !N->isDivergent() && static_cast(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); }]>; Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -156,47 +156,39 @@ } ; GCN-LABEL: {{^}}restore_m0_lds: -; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] -; TOSMEM: s_cmp_eq_u32 + ; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it ; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill ; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x300 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill ; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_cbranch_scc1 +; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] ; TOSMEM: s_mov_b32 m0, -1 -; TOSMEM: s_mov_b32 s0, m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload -; TOSMEM: s_mov_b32 m0, s0 -; TOSMEM: s_waitcnt lgkmcnt(0) - ; TOSMEM: ds_write_b64 ; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x300 -; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload +; TOSMEM: s_add_u32 m0, s3, 0x100 +; TOSMEM: s_buffer_load_dword s2, s[88:91], m0 ; 4-byte Folded Reload ; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM-NOT: m0 -; TOSMEM: s_mov_b32 m0, s0 +; TOSMEM: s_mov_b32 m0, s2 ; TOSMEM: ; use m0 ; TOSMEM: s_dcache_wb ; TOSMEM: s_endpgm define amdgpu_kernel void @restore_m0_lds(i32 %arg) { %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 - %sval = load volatile i64, i64 addrspace(2)* undef + %sval = load i64, i64 addrspace(2)* undef %cmp = icmp eq i32 %arg, 0 br i1 %cmp, label %ret, label %bb bb: - store volatile i64 %sval, i64 addrspace(3)* undef + store i64 %sval, i64 addrspace(3)* undef call void asm sideeffect "; use $0", "{M0}"(i32 %m0) #0 br label %ret