Index: include/llvm/Analysis/DivergenceAnalysis.h
===================================================================
--- include/llvm/Analysis/DivergenceAnalysis.h
+++ include/llvm/Analysis/DivergenceAnalysis.h
@@ -13,6 +13,8 @@
 // better decisions.
 //
 //===----------------------------------------------------------------------===//
+#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
+#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
 
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/IR/Function.h"
@@ -46,3 +48,5 @@
   DenseSet<const Value *> DivergentValues;
 };
 } // End llvm namespace
+
+#endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
\ No newline at end of file
Index: include/llvm/CodeGen/FunctionLoweringInfo.h
===================================================================
--- include/llvm/CodeGen/FunctionLoweringInfo.h
+++ include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -118,6 +118,17 @@
   /// cross-basic-block values.
   DenseMap<const Value *, unsigned> ValueMap;
 
+  /// VirtReg2Value map is needed by the Divergence Analysis driven
+  /// instruction selection. It is the reverse of ValueMap and is computed
+  /// lazily, on demand. It is used to get the Value corresponding to a
+  /// live-in virtual register and is queried from
+  /// TargetLowering::isSDNodeSourceOfDivergence.
+  DenseMap<unsigned, const Value *> VirtReg2Value;
+
+  /// This method is called from TargetLowering::isSDNodeSourceOfDivergence
+  /// to get the Value corresponding to the live-in virtual register.
+  const Value *getValueFromVirtualReg(unsigned Vreg);
+
   /// Track virtual registers created for exception pointers.
   DenseMap<const Value *, unsigned> CatchPadExceptionPointers;
 
Index: include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- include/llvm/CodeGen/SelectionDAG.h
+++ include/llvm/CodeGen/SelectionDAG.h
@@ -28,8 +28,10 @@
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineValueType.h"
@@ -217,6 +219,9 @@
   LLVMContext *Context;
   CodeGenOpt::Level OptLevel;
 
+  DivergenceAnalysis *DA = nullptr;
+  FunctionLoweringInfo *FLI = nullptr;
+
   /// The function-level optimization remark emitter. Used to emit remarks
   /// whenever manipulating the DAG.
   OptimizationRemarkEmitter *ORE;
@@ -346,19 +351,7 @@
         .getRawSubclassData();
   }
 
-  void createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
-    assert(!Node->OperandList && "Node already has operands");
-    SDUse *Ops = OperandRecycler.allocate(
-        ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
-
-    for (unsigned I = 0; I != Vals.size(); ++I) {
-      Ops[I].setUser(Node);
-      Ops[I].setInitial(Vals[I]);
-    }
-    Node->NumOperands = Vals.size();
-    Node->OperandList = Ops;
-    checkForCycles(Node);
-  }
+  void createOperands(SDNode *Node, ArrayRef<SDValue> Vals);
 
   void removeOperands(SDNode *Node) {
     if (!Node->OperandList)
@@ -378,7 +371,12 @@
 
   /// Prepare this SelectionDAG to process code in the given MachineFunction.
   void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
-            Pass *PassPtr, const TargetLibraryInfo *LibraryInfo);
+            Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
+            DivergenceAnalysis *DA);
+
+  void setFunctionLoweringInfo(FunctionLoweringInfo *FuncInfo) {
+    FLI = FuncInfo;
+  }
 
   /// Clear state and free memory necessary to make this
   /// SelectionDAG ready to process a new block.
Index: include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- include/llvm/CodeGen/SelectionDAGNodes.h
+++ include/llvm/CodeGen/SelectionDAGNodes.h
@@ -466,11 +466,14 @@
     friend class SDNode;
     friend class MemIntrinsicSDNode;
     friend class MemSDNode;
+    friend class SelectionDAG;
+    friend class SelectionDAGBuilder;
 
     uint16_t HasDebugValue : 1;
     uint16_t IsMemIntrinsic : 1;
+    uint16_t IsDivergent : 1;
   };
-  enum { NumSDNodeBits = 2 };
+  enum { NumSDNodeBits = 3 };
 
   class ConstantSDNodeBitfields {
     friend class ConstantSDNode;
@@ -548,6 +551,8 @@
   // TODO: unfriend HandleSDNode once we fix its operand handling.
   friend class HandleSDNode;
 
+  friend class SelectionDAGBuilder;
+
   /// Unique id per SDNode in the DAG.
   int NodeId = -1;
 
@@ -662,6 +667,8 @@
   bool getHasDebugValue() const { return SDNodeBits.HasDebugValue; }
   void setHasDebugValue(bool b) { SDNodeBits.HasDebugValue = b; }
 
+  bool isDivergent() const { return SDNodeBits.IsDivergent; }
+
   /// Return true if there are no uses of this node.
   bool use_empty() const { return UseList == nullptr; }
 
@@ -997,7 +1004,19 @@
   /// This method should only be used by the SDUse class.
   void addUse(SDUse &U) { U.addToList(&UseList); }
-
+
+  void updateDivergence() {
+    bool IsDivergent = SDNodeBits.IsDivergent;
+    for (auto &Op : ops()) {
+      IsDivergent |= Op.getNode()->isDivergent();
+    }
+    if (SDNodeBits.IsDivergent != IsDivergent) {
+      SDNodeBits.IsDivergent = IsDivergent;
+      for (auto U : uses()) {
+        U->updateDivergence();
+      }
+    }
+  }
 protected:
   static SDVTList getSDVTList(EVT VT) {
     SDVTList Ret = { getValueTypeList(VT), 1 };
Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -29,6 +29,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineValueType.h"
@@ -2556,6 +2557,15 @@
 
   bool isPositionIndependent() const;
 
+  virtual bool isSDNodeSourceOfDivergence(const SDNode *N,
+      FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const {
+    return false;
+  }
+
+  virtual bool isSDNodeAlwaysUniform(const SDNode *N) const {
+    return false;
+  }
+
   /// Returns true by value, base pointer and offset pointer and addressing mode
   /// by reference if the node's address can be legally represented as
   /// pre-indexed load / store address.
Index: lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -547,3 +547,13 @@
   }
   return std::make_pair(It->second, false);
 }
+
+const Value *
+FunctionLoweringInfo::getValueFromVirtualReg(unsigned Vreg) {
+  if (VirtReg2Value.empty()) {
+    for (auto &P : ValueMap) {
+      VirtReg2Value[P.second] = P.first;
+    }
+  }
+  return VirtReg2Value[Vreg];
+}
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -903,7 +903,8 @@
 
 void SelectionDAG::init(MachineFunction &NewMF,
                         OptimizationRemarkEmitter &NewORE,
-                        Pass *PassPtr, const TargetLibraryInfo *LibraryInfo) {
+                        Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
+                        DivergenceAnalysis *Divergence) {
   MF = &NewMF;
   SDAGISelPass = PassPtr;
   ORE = &NewORE;
@@ -911,6 +912,7 @@
   TSI = getSubtarget().getSelectionDAGInfo();
   LibInfo = LibraryInfo;
   Context = &MF->getFunction().getContext();
+  DA = Divergence;
 }
 
 SelectionDAG::~SelectionDAG() {
@@ -7239,8 +7241,9 @@
     SDUse &Use = UI.getUse();
     ++UI;
     Use.set(To);
+    if (To->isDivergent() != From->isDivergent())
+      User->updateDivergence();
   } while (UI != UE && *UI == User);
-
   // Now that we have modified User, add it back to the CSE maps.  If it
   // already exists there, recursively merge the results together.
   AddModifiedNodeToCSEMaps(User);
@@ -7294,6 +7297,8 @@
     SDUse &Use = UI.getUse();
     ++UI;
     Use.setNode(To);
+    if (To->isDivergent() != From->isDivergent())
+      User->updateDivergence();
   } while (UI != UE && *UI == User);
 
   // Now that we have modified User, add it back to the CSE maps.  If it
@@ -7338,8 +7343,9 @@
     const SDValue &ToOp = To[Use.getResNo()];
     ++UI;
     Use.set(ToOp);
+    if (To->getNode()->isDivergent() != From->isDivergent())
+      User->updateDivergence();
   } while (UI != UE && *UI == User);
-
   // Now that we have modified User, add it back to the CSE maps.  If it
   // already exists there, recursively merge the results together.
   AddModifiedNodeToCSEMaps(User);
@@ -7397,8 +7403,9 @@
 
       ++UI;
       Use.set(To);
+      if (To->isDivergent() != From->isDivergent())
+        User->updateDivergence();
     } while (UI != UE && *UI == User);
-
     // We are iterating over all uses of the From node, so if a use
     // doesn't use the specific value, no changes are made.
     if (!UserRemovedFromCSEMaps)
@@ -8236,6 +8243,25 @@
   return nullptr;
 }
 
+void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
+  assert(!Node->OperandList && "Node already has operands");
+  SDUse *Ops = OperandRecycler.allocate(
+      ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
+
+  bool IsDivergent = false;
+  for (unsigned I = 0; I != Vals.size(); ++I) {
+    Ops[I].setUser(Node);
+    Ops[I].setInitial(Vals[I]);
+    IsDivergent = IsDivergent || Ops[I].getNode()->isDivergent();
+  }
+  Node->NumOperands = Vals.size();
+  Node->OperandList = Ops;
+  IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, DA);
+  if (!TLI->isSDNodeAlwaysUniform(Node))
+    Node->SDNodeBits.IsDivergent = IsDivergent;
+  checkForCycles(Node);
+}
+
 #ifndef NDEBUG
 static void checkForCyclesHelper(const SDNode *N,
                                  SmallPtrSetImpl<const SDNode *> &Visited,
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -628,6 +628,8 @@
 
   if (getNodeId() != -1)
     OS << " [ID=" << getNodeId() << ']';
+  if (!(isa<ConstantSDNode>(this) || isa<ConstantFPSDNode>(this)))
+    OS << "# D:" << isDivergent();
 
   if (!G)
     return;
Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -414,7 +414,8 @@
 
   SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI);
 
-  CurDAG->init(*MF, *ORE, this, LibInfo);
+  CurDAG->init(*MF, *ORE, this, LibInfo,
+               getAnalysisIfAvailable<DivergenceAnalysis>());
   FuncInfo->set(Fn, *MF, CurDAG);
 
   // Now get the optional analyzes if we want to.
@@ -1401,6 +1402,8 @@
   FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()];
   FuncInfo->InsertPt = FuncInfo->MBB->begin();
 
+  CurDAG->setFunctionLoweringInfo(FuncInfo);
+
   if (!FastIS) {
     LowerArguments(Fn);
   } else {
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
@@ -83,6 +84,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AMDGPUArgumentUsageInfo>();
+    AU.addRequired<DivergenceAnalysis>();
     SelectionDAGISel::getAnalysisUsage(AU);
   }
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -168,6 +168,11 @@
   bool isCheapToSpeculateCttz() const override;
   bool isCheapToSpeculateCtlz() const override;
 
+  bool isSDNodeSourceOfDivergence(const SDNode *N,
+      FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const;
+
+  bool isSDNodeAlwaysUniform(const SDNode *N) const;
+
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
   static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -748,6 +748,123 @@
   return true;
 }
 
+bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
+  switch (N->getOpcode()) {
+  default:
+    return false;
+  case ISD::INTRINSIC_WO_CHAIN:
+  {
+    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IntrID) {
+    default:
+      return false;
+    case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readlane:
+      return true;
+    }
+  }
+  break;
+  }
+}
+
+bool AMDGPUTargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
+    FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const
+{
+  switch (N->getOpcode()) {
+  case ISD::CopyFromReg:
+  {
+    if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(N->getOperand(1)))
+    {
+      unsigned Reg = R->getReg();
+      const AMDGPURegisterInfo *RI = Subtarget->getRegisterInfo();
+      if (RI->isPhysicalRegister(Reg)) {
+        return RI->getRegClass(AMDGPU::VGPR_32RegClassID)->contains(Reg) ||
+               RI->getRegClass(AMDGPU::VReg_64RegClassID)->contains(Reg) ||
+               RI->getRegClass(AMDGPU::VReg_96RegClassID)->contains(Reg) ||
+               RI->getRegClass(AMDGPU::VReg_128RegClassID)->contains(Reg) ||
+               RI->getRegClass(AMDGPU::VReg_256RegClassID)->contains(Reg) ||
+               RI->getRegClass(AMDGPU::VReg_512RegClassID)->contains(Reg);
+      }
+      else {
+        // Formal arguments of non-entry functions
+        // are conservatively considered divergent.
+        if (FLI->RegInfo->isLiveIn(Reg) &&
+            !AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
+          return true;
+        return DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+      }
+    }
+  }
+  break;
+  case ISD::LOAD:
+  {
+    const LoadSDNode *L = dyn_cast<LoadSDNode>(N);
+    if (L->getMemOperand()->getAddrSpace() == Subtarget->getAMDGPUAS().PRIVATE_ADDRESS)
+      return true;
+  }
+  break;
+  case ISD::CALLSEQ_END:
+    return true;
+  break;
+  case ISD::INTRINSIC_WO_CHAIN:
+  {
+    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IntrID) {
+    case Intrinsic::amdgcn_workitem_id_x:
+    case Intrinsic::amdgcn_workitem_id_y:
+    case Intrinsic::amdgcn_workitem_id_z:
+    case Intrinsic::r600_read_tidig_x:
+    case Intrinsic::r600_read_tidig_y:
+    case Intrinsic::r600_read_tidig_z:
+    case Intrinsic::amdgcn_interp_p1:
+    case Intrinsic::amdgcn_interp_p2:
+    case Intrinsic::amdgcn_interp_mov:
+      return true;
+    }
+  }
+  break;
+  case ISD::INTRINSIC_W_CHAIN:
+  {
+    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntrID) {
+    case Intrinsic::amdgcn_mbcnt_hi:
+    case Intrinsic::amdgcn_mbcnt_lo:
+    case Intrinsic::amdgcn_atomic_inc:
+    case Intrinsic::amdgcn_atomic_dec:
+    case Intrinsic::amdgcn_image_atomic_swap:
+    case Intrinsic::amdgcn_image_atomic_add:
+    case Intrinsic::amdgcn_image_atomic_sub:
+    case Intrinsic::amdgcn_image_atomic_smin:
+    case Intrinsic::amdgcn_image_atomic_umin:
+    case Intrinsic::amdgcn_image_atomic_smax:
+    case Intrinsic::amdgcn_image_atomic_umax:
+    case Intrinsic::amdgcn_image_atomic_and:
+    case Intrinsic::amdgcn_image_atomic_or:
+    case Intrinsic::amdgcn_image_atomic_xor:
+    case Intrinsic::amdgcn_image_atomic_inc:
+    case Intrinsic::amdgcn_image_atomic_dec:
+    case Intrinsic::amdgcn_image_atomic_cmpswap:
+    case Intrinsic::amdgcn_buffer_atomic_swap:
+    case Intrinsic::amdgcn_buffer_atomic_add:
+    case Intrinsic::amdgcn_buffer_atomic_sub:
+    case Intrinsic::amdgcn_buffer_atomic_smin:
+    case Intrinsic::amdgcn_buffer_atomic_umin:
+    case Intrinsic::amdgcn_buffer_atomic_smax:
+    case Intrinsic::amdgcn_buffer_atomic_umax:
+    case Intrinsic::amdgcn_buffer_atomic_and:
+    case Intrinsic::amdgcn_buffer_atomic_or:
+    case Intrinsic::amdgcn_buffer_atomic_xor:
+    case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+    case Intrinsic::amdgcn_ps_live:
+    case Intrinsic::amdgcn_ds_swizzle:
+      return true;
+    }
+  }
+  break;
+  }
+  return false;
+}
+
 //===---------------------------------------------------------------------===//
 // Target Properties
 //===---------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5379,7 +5379,7 @@
     unsigned NumElements = MemVT.getVectorNumElements();
 
     if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
-      if (isMemOpUniform(Load))
+      if (!Op->isDivergent())
        return SDValue();
      // Non-uniform loads will be selected to MUBUF instructions, so they
      // have the same legalization requirements as global and private
@@ -5387,7 +5387,7 @@
      //
    }
    if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
-      if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
+      if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
          !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
        return SDValue();
      // Non-uniform loads will be selected to MUBUF instructions, so they
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -223,11 +223,9 @@
 def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
   auto Ld = cast<LoadSDNode>(N);
   return Ld->getAlignment() >= 4 &&
-    ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
-    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+    ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && !N->isDivergent()) ||
     (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
-    !Ld->isVolatile() &&
-    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+    !Ld->isVolatile() && !N->isDivergent() &&
     static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
 }]>;
Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
===================================================================
--- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -2,7 +2,9 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 ; GCN-LABEL: {{^}}use_dispatch_ptr:
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @use_dispatch_ptr() #1 {
   %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
@@ -19,7 +21,9 @@
 }
 
 ; GCN-LABEL: {{^}}use_queue_ptr:
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @use_queue_ptr() #1 {
   %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
@@ -37,11 +41,12 @@
 }
 
 ; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast:
-; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10
+; CIVI: flat_load_dword v[[HI:[0-9]+]], v[0:1]
 ; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]]
-
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}}
+; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16
+; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
+; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}}
+; CIVI: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @use_queue_ptr_addrspacecast() #1 {
   %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32*
   store volatile i32 0, i32* %asc
@@ -60,7 +65,9 @@
 }
 
 ; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @use_kernarg_segment_ptr() #1 {
   %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
@@ -424,9 +431,15 @@
 ; GCN-LABEL: {{^}}use_every_sgpr_input:
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
-; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
-; GCN: s_load_dword s{{[0-9]+}}, s[10:11], 0x0
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s10
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s11
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 ; GCN: ; use s[12:13]
 ; GCN: ; use s14
 ; GCN: ; use s15
@@ -554,15 +567,26 @@
 ; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[8:9]
 ; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[10:11]
-; GCN-DAG: s_mov_b32 s6, s14
+; GCN-DAG: s_mov_b32 s6, s14
 ; GCN-DAG: s_mov_b32 s7, s15
 ; GCN-DAG: s_mov_b32 s8, s16
+
+; GCN-DAG: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[6:7]
+; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[8:9]
+; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[10:11]
+
 ; GCN: s_swappc_b64
 
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
-; GCN: s_load_dword s{{[0-9]+}},
-; GCN: s_load_dword s{{[0-9]+}},
-; GCN: s_load_dword s{{[0-9]+}},
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_X]]
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_X]]
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Y]]
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Y]]
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Z]]
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Z]]
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
 ; GCN: ; use
 ; GCN: ; use [[SAVE_X]]
 ; GCN: ; use [[SAVE_Y]]
Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -34,7 +34,13 @@
 
 ; GCN-LABEL: {{^}}func_implicitarg_ptr:
 ; GCN: s_waitcnt
-; GCN-NEXT: s_load_dword s{{[0-9]+}}, s[6:7], 0x0{{$}}
+; MESA: s_mov_b64 s[8:9], s[6:7]
+; MESA: s_mov_b32 s11, 0xf000
+; MESA: s_mov_b32 s10, -1
+; MESA: buffer_load_dword v0, off, s[8:11], 0
+; HSA: v_mov_b32_e32 v0, s6
+; HSA: v_mov_b32_e32 v1, s7
+; HSA: flat_load_dword v0, v[0:1]
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @func_implicitarg_ptr() #1 {
@@ -83,8 +89,21 @@
 
 ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
 ; GCN: s_waitcnt
-; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0{{$}}
-; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0{{$}}
+; MESA: s_mov_b64 s[12:13], s[6:7]
+; MESA: s_mov_b32 s15, 0xf000
+; MESA: s_mov_b32 s14, -1
+; MESA: buffer_load_dword v0, off, s[12:15], 0
+; HSA: v_mov_b32_e32 v0, s6
+; HSA: v_mov_b32_e32 v1, s7
+; HSA: flat_load_dword v0, v[0:1]
+; MESA: s_mov_b32 s10, s14
+; MESA: s_mov_b32 s11, s15
+; MESA: buffer_load_dword v0, off, s[8:11], 0
+; HSA: v_mov_b32_e32 v0, s8
+; HSA: v_mov_b32_e32 v1, s9
+; HSA: flat_load_dword v0, v[0:1]
+
+; GCN: s_waitcnt vmcnt(0)
 define void @func_kernarg_implicitarg_ptr() #1 {
   %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
Index: test/CodeGen/AMDGPU/spill-m0.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-m0.ll
+++ test/CodeGen/AMDGPU/spill-m0.ll
@@ -156,47 +156,39 @@
 }
 
 ; GCN-LABEL: {{^}}restore_m0_lds:
-; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
-; TOSMEM: s_cmp_eq_u32
+
 ; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
 ; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
 ; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x300
 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
 ; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_cbranch_scc1
 
+; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
 ; TOSMEM: s_mov_b32 m0, -1
 
-; TOSMEM: s_mov_b32 s0, m0
-; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload
-; TOSMEM: s_mov_b32 m0, s0
-; TOSMEM: s_waitcnt lgkmcnt(0)
-
 ; TOSMEM: ds_write_b64
 
 ; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x300
-; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
+; TOSMEM: s_add_u32 m0, s3, 0x100
+; TOSMEM: s_buffer_load_dword s2, s[88:91], m0 ; 4-byte Folded Reload
 ; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_waitcnt lgkmcnt(0)
 ; TOSMEM-NOT: m0
-; TOSMEM: s_mov_b32 m0, s0
+; TOSMEM: s_mov_b32 m0, s2
 ; TOSMEM: ; use m0
 
 ; TOSMEM: s_dcache_wb
 ; TOSMEM: s_endpgm
 define amdgpu_kernel void @restore_m0_lds(i32 %arg) {
   %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
-  %sval = load volatile i64, i64 addrspace(2)* undef
+  %sval = load i64, i64 addrspace(2)* undef
   %cmp = icmp eq i32 %arg, 0
   br i1 %cmp, label %ret, label %bb
 
 bb:
-  store volatile i64 %sval, i64 addrspace(3)* undef
+  store i64 %sval, i64 addrspace(3)* undef
   call void asm sideeffect "; use $0", "{M0}"(i32 %m0) #0
   br label %ret