diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -7,10 +7,10 @@ //===----------------------------------------------------------------------===// // /// \file -/// This pass tries to remove unnecessary VGPR live range in divergent if-else -/// structure. +/// This pass tries to remove unnecessary VGPR live ranges in divergent if-else +/// structures and waterfall loops. /// -/// When we do structurization, we usually transform a if-else into two +/// When we do structurization, we usually transform an if-else into two /// sucessive if-then (with a flow block to do predicate inversion). Consider a /// simple case after structurization: A divergent value %a was defined before /// if-else and used in both THEN (use in THEN is optional) and ELSE part: @@ -29,10 +29,10 @@ /// /// As register allocator has no idea of the thread-control-flow, it will just /// assume %a would be alive in the whole range of bb.then because of a later -/// use in bb.else. On AMDGPU architecture, the VGPR was accessed with respect +/// use in bb.else. On AMDGPU architecture, the VGPR is accessed with respect /// to exec mask. For this if-else case, the lanes active in bb.then will be -/// inactive in bb.else, and vice-verse. So we are safe to say that %a was dead -/// after the last use in bb.then untill the end of the block. The reason is +/// inactive in bb.else, and vice-versa. So we are safe to say that %a was dead +/// after the last use in bb.then until the end of the block. The reason is /// the instructions in bb.then will only overwrite lanes that will never be /// accessed in bb.else. /// @@ -46,6 +46,28 @@ /// sure the second loop iteration still get correct data. /// 2.) There should be no further uses after the IF-ELSE region. 
/// +/// +/// Waterfall loops get inserted around instructions that use divergent values +/// but can only be executed with a uniform value. For example an indirect call +/// to a divergent address: +/// bb.start: +/// %a = ... +/// %fun = ... +/// ... +/// bb.loop: +/// call %fun (%a) +/// ... // %a can be dead here +/// loop %bb.loop +/// +/// The loop block is executed multiple times, but it is run exactly once for +/// each active lane. Similar to the if-else case, the register allocator +/// assumes that %a is live throughout the loop as it is used again in the next +/// iteration. If %a is a VGPR that is unused after the loop, it does not need +/// to be live after its last use in the loop block. By inserting a phi-node at +/// the start of bb.loop that is undef when coming from bb.loop, the register +/// allocator knows that the value of %a does not need to be preserved through +/// iterations of the loop. +/// // //===----------------------------------------------------------------------===// @@ -89,6 +111,10 @@ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks, SmallVectorImpl<Register> &CandidateRegs) const; + void collectWaterfallCandidateRegisters( + MachineBasicBlock *Loop, + SmallSetVector<Register, 16> &CandidateRegs) const; + void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB, SmallVectorImpl<MachineInstr *> &Uses) const; @@ -105,6 +131,8 @@ MachineBasicBlock *Flow, MachineBasicBlock *Endif, SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const; + void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *Loop) const; + SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -278,6 +306,54 @@ } } +/// Collect the registers used in the waterfall loop block that are defined +/// before. 
+void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( + MachineBasicBlock *Loop, + SmallSetVector<Register, 16> &CandidateRegs) const { + + for (auto &MI : Loop->instrs()) { + if (MI.isDebugInstr()) + continue; + + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg() || MO.isDef()) + continue; + + Register MOReg = MO.getReg(); + // We can only optimize AGPR/VGPR virtual registers + if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg)) + continue; + + if (MO.readsReg()) { + const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); + // Make sure the value is defined before the LOOP block + if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) { + // If the variable is used after the loop, the register coalescer will + // merge the newly created register and remove the phi node again. + // Just do nothing in that case. + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg); + bool IsUsed = false; + for (auto *Succ : Loop->successors()) { + if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) { + IsUsed = true; + break; + } + } + if (!IsUsed) { + LLVM_DEBUG(dbgs() << "Found candidate reg: " + << printReg(MOReg, TRI, 0, MRI) << '\n'); + CandidateRegs.insert(MOReg); + } else { + LLVM_DEBUG(dbgs() << "Reg is used after loop, ignoring: " + << printReg(MOReg, TRI, 0, MRI) << '\n'); + } + } + } + } + } +} + // Re-calculate the liveness of \p Reg in the THEN-region void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const { @@ -403,12 +479,8 @@ } // Replace all uses in the ELSE region or the PHIs in ENDIF block - for (auto I = MRI->use_begin(Reg), E = MRI->use_end(); I != E;) { - MachineOperand &O = *I; - // This is a little bit tricky, the setReg() will update the linked list, - // so we have to increment the iterator before setReg() to avoid skipping - // some uses. - ++I; + // Use early increment range because setReg() will update the linked list. 
+ for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { auto *UseMI = O.getParent(); auto *UseBlock = UseMI->getParent(); // Replace uses in Endif block @@ -431,6 +503,53 @@ updateLiveRangeInThenRegion(Reg, If, Flow); } +void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( + Register Reg, MachineBasicBlock *Loop) const { + // Insert a new PHI, marking the value from the last loop iteration undef. + LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n'); + const auto *RC = MRI->getRegClass(Reg); + Register NewReg = MRI->createVirtualRegister(RC); + Register UndefReg = MRI->createVirtualRegister(RC); + + // Replace all uses in the LOOP region + // Use early increment range because setReg() will update the linked list. + for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { + auto *UseMI = O.getParent(); + auto *UseBlock = UseMI->getParent(); + // Replace uses in Loop block + if (UseBlock == Loop) + O.setReg(NewReg); + } + + MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + for (auto *Pred : Loop->predecessors()) { + if (Pred == Loop) + PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred); + else + PHI.addReg(Reg).addMBB(Pred); + } + + LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg); + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); + + // collectWaterfallCandidateRegisters only collects registers that are dead + // after the loop. So we know that the old reg is not live throughout the + // whole block anymore. 
+ OldVarInfo.AliveBlocks.reset(Loop->getNumber()); + + // Mark the last use as kill + for (auto &MI : reverse(Loop->instrs())) { + if (MI.readsRegister(NewReg, TRI)) { + MI.addRegisterKilled(NewReg, TRI); + NewVarInfo.Kills.push_back(&MI); + break; + } + } + assert(!NewVarInfo.Kills.empty() && + "Failed to find last usage of register in loop"); +} + char SIOptimizeVGPRLiveRange::ID = 0; INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE, @@ -491,6 +610,16 @@ // Now we are safe to optimize. for (auto Reg : CandidateRegs) optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); + } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) { + LLVM_DEBUG(dbgs() << "Checking Waterfall loop: " + << printMBBReference(MBB) << '\n'); + + SmallSetVector CandidateRegs; + collectWaterfallCandidateRegisters(&MBB, CandidateRegs); + MadeChange |= !CandidateRegs.empty(); + // Now we are safe to optimize. + for (auto Reg : CandidateRegs) + optimizeWaterfallLiveRange(Reg, &MBB); } } } diff --git a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll --- a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll @@ -24,6 +24,8 @@ ; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG7]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]] ; GCN-NEXT: s_cbranch_execnz [[RSRC_LOOP]] define amdgpu_ps <4 x float> @water_loop_rsrc(<8 x i32> %rsrc, <4 x i32> inreg %samp, float %s, float %t) { @@ -48,6 +50,8 @@ ; GCN-NEXT: s_nop 0 ; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG3]]{{\]}} dmask:0x1 +; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]] ; GCN-NEXT: s_cbranch_execnz [[SAMP_LOOP]] define amdgpu_ps <4 x float> @water_loop_samp(<8 x i32> inreg %rsrc, <4 x i32> %samp, float %s, float %t) { diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -202,32 +202,28 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 
+; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -235,13 +231,11 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -250,36 +244,34 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB2_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: 
v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -292,32 +284,28 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: 
v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -325,13 +313,11 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: v_mov_b32_e32 v0, 0x7b ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] @@ -341,36 +327,34 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB3_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; 
GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -383,32 +367,28 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -416,13 +396,11 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB4_1: ; 
=>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -431,37 +409,36 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB4_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-NEXT: v_readlane_b32 s4, 
v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -475,32 +452,28 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 19 +; GCN-NEXT: v_writelane_b32 v40, s33, 19 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 
v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s50, 15 -; GCN-NEXT: v_writelane_b32 v43, s51, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s50, 15 +; GCN-NEXT: v_writelane_b32 v40, s51, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -508,20 +481,18 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GCN-NEXT: s_cbranch_execz BB5_4 ; GCN-NEXT: ; %bb.1: ; %bb1 -; GCN-NEXT: v_writelane_b32 v43, s30, 17 -; GCN-NEXT: v_writelane_b32 v43, s31, 18 +; GCN-NEXT: v_writelane_b32 v40, s30, 17 +; GCN-NEXT: v_writelane_b32 v40, s31, 18 ; GCN-NEXT: s_mov_b64 
s[48:49], exec ; GCN-NEXT: BB5_2: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[50:51], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -530,40 +501,38 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[50:51] ; GCN-NEXT: s_cbranch_execnz BB5_2 ; GCN-NEXT: ; %bb.3: ; GCN-NEXT: s_mov_b64 exec, s[48:49] -; GCN-NEXT: v_readlane_b32 s30, v43, 17 -; GCN-NEXT: v_readlane_b32 s31, v43, 18 +; GCN-NEXT: v_readlane_b32 s30, v40, 17 +; GCN-NEXT: v_readlane_b32 s31, v40, 18 ; GCN-NEXT: BB5_4: ; %bb2 ; GCN-NEXT: s_or_b64 exec, exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s51, v43, 16 -; GCN-NEXT: v_readlane_b32 s50, v43, 15 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, 
s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 19 +; GCN-NEXT: v_readlane_b32 s51, v40, 16 +; GCN-NEXT: v_readlane_b32 s50, v40, 15 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 19 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -583,48 +552,145 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v42, s33, 6 +; GCN-NEXT: v_writelane_b32 v40, s33, 6 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v42, s34, 0 -; GCN-NEXT: v_writelane_b32 v42, s35, 1 -; GCN-NEXT: v_writelane_b32 v42, s36, 2 -; GCN-NEXT: v_writelane_b32 v42, s37, 3 -; GCN-NEXT: 
v_writelane_b32 v42, s30, 4 -; GCN-NEXT: v_writelane_b32 v42, s31, 5 -; GCN-NEXT: v_mov_b32_e32 v41, v1 -; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s30, 4 +; GCN-NEXT: v_writelane_b32 v40, s31, 5 ; GCN-NEXT: s_mov_b64 s[34:35], exec ; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v40 -; GCN-NEXT: v_readfirstlane_b32 s7, v41 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[40:41] +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc ; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] ; GCN-NEXT: s_cbranch_execnz BB6_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_readlane_b32 s4, v42, 4 -; GCN-NEXT: v_readlane_b32 s5, v42, 5 -; GCN-NEXT: v_readlane_b32 s37, v42, 3 -; GCN-NEXT: v_readlane_b32 s36, v42, 2 -; GCN-NEXT: v_readlane_b32 s35, v42, 1 -; GCN-NEXT: v_readlane_b32 s34, v42, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 4 +; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v42, 6 +; GCN-NEXT: v_readlane_b32 s33, v40, 6 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; 
GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] call amdgpu_gfx void %fptr(i32 inreg 123) ret void } + +define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr) { +; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v41, s33, 6 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v41, s34, 0 +; GCN-NEXT: v_writelane_b32 v41, s35, 1 +; GCN-NEXT: v_writelane_b32 v41, s36, 2 +; GCN-NEXT: v_writelane_b32 v41, s37, 3 +; GCN-NEXT: v_writelane_b32 v41, s30, 4 +; GCN-NEXT: v_writelane_b32 v41, s31, 5 +; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: s_mov_b64 s[34:35], exec +; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s5, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc +; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] +; GCN-NEXT: s_cbranch_execnz BB7_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: v_readlane_b32 s4, v41, 4 +; GCN-NEXT: v_readlane_b32 s5, v41, 5 +; GCN-NEXT: v_readlane_b32 s37, v41, 3 +; GCN-NEXT: v_readlane_b32 s36, v41, 2 +; GCN-NEXT: v_readlane_b32 s35, v41, 1 +; GCN-NEXT: v_readlane_b32 s34, v41, 0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v41, 6 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void %fptr(i32 %i) + ret i32 %i +} + +; Use a variable inside a waterfall loop and use the return variable after the loop. +; TODO The argument and return variable could be in the same physical register, but the register +; allocator is not able to do that because the return value clashes with the liverange of an +; IMPLICIT_DEF of the argument. +define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr) { +; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v40, s33, 6 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s30, 4 +; GCN-NEXT: v_writelane_b32 v40, s31, 5 +; GCN-NEXT: s_mov_b64 s[34:35], exec +; GCN-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s5, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v3, v0 +; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] +; GCN-NEXT: s_cbranch_execnz BB8_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: v_readlane_b32 s4, v40, 4 +; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: 
v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 6 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + %ret = call amdgpu_gfx i32 %fptr(i32 %i) + ret i32 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -18,6 +18,8 @@ ; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX10-NEXT: s_and_saveexec_b32 s0, s0 ; GFX10-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execnz BB0_1 @@ -44,6 +46,8 @@ ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr4 ; GFX9-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz BB0_1 ; GFX9-NEXT: ; %bb.2: @@ -68,6 +72,8 @@ ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz BB0_1 ; GFX8-NEXT: ; %bb.2: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll 
b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -13,19 +13,20 @@ ; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: flat_load_dwordx2 v[2:3], v[6:7] -; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] +; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-NEXT: v_readfirstlane_b32 s10, v2 -; GCN-NEXT: v_readfirstlane_b32 s11, v3 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] -; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GCN-NEXT: v_readfirstlane_b32 s8, v2 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3] +; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 +; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4