diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5258,7 +5258,7 @@ .addReg(Exec) .addReg(SaveExec); - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); } // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -300,6 +300,14 @@ let hasSideEffects = 1; } +def SI_WATERFALL_LOOP : CFPseudoInstSI < + (outs), + (ins brtarget:$target), [], 1> { + let Size = 8; + let isBranch = 1; + let Defs = []; +} + def SI_LOOP : CFPseudoInstSI < (outs), (ins SReg_1:$saved, brtarget:$target), [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> { diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -91,6 +91,7 @@ void emitElse(MachineInstr &MI); void emitIfBreak(MachineInstr &MI); void emitLoop(MachineInstr &MI); + void emitWaterfallLoop(MachineInstr &MI); MachineBasicBlock *emitEndCf(MachineInstr &MI); @@ -418,6 +419,22 @@ MI.eraseFromParent(); } +void SILowerControlFlow::emitWaterfallLoop(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + MachineInstr *Branch = + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .add(MI.getOperand(0)); + + if (LIS) { + LIS->ReplaceMachineInstrInMaps(MI, *Branch); + LIS->InsertMachineInstrInMaps(*Branch); + } + + MI.eraseFromParent(); +} + MachineBasicBlock::iterator SILowerControlFlow::skipIgnoreExecInstsTrivialSucc( MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { @@ -600,6 +617,10 @@ emitLoop(MI); break; + case AMDGPU::SI_WATERFALL_LOOP: + emitWaterfallLoop(MI); + break; + case AMDGPU::SI_END_CF: SplitBB = emitEndCf(MI); break; @@ -843,6 +864,7 @@ case AMDGPU::SI_ELSE: case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_WATERFALL_LOOP: case AMDGPU::SI_LOOP: case AMDGPU::SI_END_CF: // Only build worklist if SI_IF instructions must be processed first. 
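(Illustrative sketch, not part of the patch: the round trip that the SIInstrInfo, SIInstructions.td and SILowerControlFlow changes above set up, written in the same MIR check style as the mubuf-legalize-operands.mir expectations further down; the block label and the [[TMPEXEC]] placeholder are taken from those checks and are only meant to show where the new pseudo sits.)
  # Waterfall-loop backedge before this patch, emitted directly by SIInstrInfo:
  $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
  S_CBRANCH_EXECNZ %bb.1, implicit $exec
  # With this patch SIInstrInfo emits the new pseudo instead, so later passes can
  # still recognize the waterfall backedge:
  SI_WATERFALL_LOOP %bb.1, implicit $exec
  # SILowerControlFlow::emitWaterfallLoop then lowers the pseudo back to:
  S_CBRANCH_EXECNZ %bb.1, implicit $exec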
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -171,6 +171,7 @@ if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO || MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE || + MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP || MI.getOpcode() == AMDGPU::SI_LOOP) { Divergent = true; break; diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -89,6 +89,10 @@ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks, SmallVectorImpl<Register> &CandidateRegs) const; + void collectWaterfallCandidateRegisters( + MachineBasicBlock *Loop, + SmallSetVector<Register, 16> &CandidateRegs) const; + void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB, SmallVectorImpl<MachineInstr *> &Uses) const; @@ -105,6 +109,8 @@ MachineBasicBlock *Flow, MachineBasicBlock *Endif, SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const; + void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *If) const; + SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -278,6 +284,38 @@ } } +/// Collect the registers used in the waterfall loop block that are defined +/// before the loop. +void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( + MachineBasicBlock *Loop, + SmallSetVector<Register, 16> &CandidateRegs) const { + + for (auto &MI : Loop->instrs()) { + if (MI.isDebugInstr()) + continue; + + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg() || MO.isDef()) + continue; + + Register MOReg = MO.getReg(); + // We can only optimize AGPR/VGPR virtual registers + if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg)) + continue; + + if (MO.readsReg()) { + const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); + // Make sure the value is defined before the LOOP block + if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) { + LLVM_DEBUG(dbgs() << "Found candidate reg: " + << printReg(MOReg, TRI, 0, MRI) << '\n'); + CandidateRegs.insert(MOReg); + } + } + } + } +} + // Re-calculate the liveness of \p Reg in the THEN-region void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const { @@ -403,12 +441,8 @@ } // Replace all uses in the ELSE region or the PHIs in ENDIF block - for (auto I = MRI->use_begin(Reg), E = MRI->use_end(); I != E;) { - MachineOperand &O = *I; - // This is a little bit tricky, the setReg() will update the linked list, - // so we have to increment the iterator before setReg() to avoid skipping - // some uses. - ++I; + // Use early increment range because setReg() will update the linked list. + for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { auto *UseMI = O.getParent(); auto *UseBlock = UseMI->getParent(); // Replace uses in Endif block @@ -431,6 +465,54 @@ updateLiveRangeInThenRegion(Reg, If, Flow); } +void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( + Register Reg, MachineBasicBlock *Loop) const { + // Insert a new PHI, marking the value from the last loop iteration undef.
+ LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n'); + const auto *RC = MRI->getRegClass(Reg); + Register NewReg = MRI->createVirtualRegister(RC); + Register UndefReg = MRI->createVirtualRegister(RC); + MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + for (auto *Pred : Loop->predecessors()) { + if (Pred == Loop) + PHI.addReg(UndefReg, RegState::Undef | RegState::Kill).addMBB(Pred); + else + PHI.addReg(Reg).addMBB(Pred); + } + + // Replace all uses in the LOOP region + // Use early increment range because setReg() will update the linked list. + for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { + auto *UseMI = O.getParent(); + auto *UseBlock = UseMI->getParent(); + // Replace uses in Loop block + if (UseBlock == Loop && UseMI != PHI.getInstr()) + O.setReg(NewReg); + } + + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); + LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg); + LiveVariables::VarInfo &UndefVarInfo = LV->getVarInfo(UndefReg); + + // The optimized Reg is not alive through the Loop block anymore. + OldVarInfo.AliveBlocks.reset(Loop->getNumber()); + NewVarInfo.AliveBlocks.set(Loop->getNumber()); + + UndefVarInfo.Kills.push_back(PHI); + + // Transfer the possible Kills in the Loop from Reg to NewReg + auto I = OldVarInfo.Kills.begin(); + while (I != OldVarInfo.Kills.end()) { + if ((*I)->getParent() == Loop) { + NewVarInfo.Kills.push_back(*I); + I = OldVarInfo.Kills.erase(I); + } else { + ++I; + } + } +} + char SIOptimizeVGPRLiveRange::ID = 0; INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE, @@ -491,6 +573,16 @@ // Now we are safe to optimize. for (auto Reg : CandidateRegs) optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); + } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) { + LLVM_DEBUG(dbgs() << "Checking Waterfall loop: " + << printMBBReference(MBB) << '\n'); + + SmallSetVector CandidateRegs; + collectWaterfallCandidateRegisters(&MBB, CandidateRegs); + MadeChange |= !CandidateRegs.empty(); + // Now we are safe to optimize. 
+ for (auto Reg : CandidateRegs) + optimizeWaterfallLiveRange(Reg, &MBB); } } } diff --git a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll --- a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll @@ -24,6 +24,8 @@ ; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG7]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]] ; GCN-NEXT: s_cbranch_execnz [[RSRC_LOOP]] define amdgpu_ps <4 x float> @water_loop_rsrc(<8 x i32> %rsrc, <4 x i32> inreg %samp, float %s, float %t) { @@ -48,6 +50,8 @@ ; GCN-NEXT: s_nop 0 ; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG3]]{{\]}} dmask:0x1 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]] ; GCN-NEXT: s_cbranch_execnz [[SAMP_LOOP]] define amdgpu_ps <4 x float> @water_loop_samp(<8 x i32> inreg %rsrc, <4 x i32> %samp, float %s, float %t) { diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -202,32 +202,28 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: 
v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -235,13 +231,11 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -250,36 +244,34 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB2_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -292,32 +284,28 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -325,13 +313,11 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: v_mov_b32_e32 v0, 0x7b ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] @@ -341,36 +327,34 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB3_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 
s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -383,32 +367,28 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; 
GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -416,13 +396,11 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB4_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -431,37 +409,36 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB4_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: 
v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -475,32 +452,28 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 19 +; GCN-NEXT: v_writelane_b32 v40, s33, 19 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s50, 15 -; GCN-NEXT: v_writelane_b32 v43, s51, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s50, 15 +; GCN-NEXT: v_writelane_b32 v40, s51, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -508,20 +481,18 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GCN-NEXT: s_cbranch_execz BB5_4 ; GCN-NEXT: ; %bb.1: ; %bb1 -; GCN-NEXT: v_writelane_b32 v43, s30, 17 -; GCN-NEXT: v_writelane_b32 v43, s31, 18 +; GCN-NEXT: v_writelane_b32 v40, s30, 17 +; GCN-NEXT: v_writelane_b32 v40, s31, 18 ; GCN-NEXT: s_mov_b64 s[48:49], exec ; GCN-NEXT: BB5_2: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; 
GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[50:51], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -530,40 +501,38 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[50:51] ; GCN-NEXT: s_cbranch_execnz BB5_2 ; GCN-NEXT: ; %bb.3: ; GCN-NEXT: s_mov_b64 exec, s[48:49] -; GCN-NEXT: v_readlane_b32 s30, v43, 17 -; GCN-NEXT: v_readlane_b32 s31, v43, 18 +; GCN-NEXT: v_readlane_b32 s30, v40, 17 +; GCN-NEXT: v_readlane_b32 s31, v40, 18 ; GCN-NEXT: BB5_4: ; %bb2 ; GCN-NEXT: s_or_b64 exec, exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s51, v43, 16 -; GCN-NEXT: v_readlane_b32 s50, v43, 15 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 19 +; GCN-NEXT: v_readlane_b32 s51, v40, 16 +; GCN-NEXT: v_readlane_b32 s50, v40, 15 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 19 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -583,48 +552,93 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v42, s33, 6 +; GCN-NEXT: v_writelane_b32 v40, s33, 6 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte 
Folded Spill -; GCN-NEXT: v_writelane_b32 v42, s34, 0 -; GCN-NEXT: v_writelane_b32 v42, s35, 1 -; GCN-NEXT: v_writelane_b32 v42, s36, 2 -; GCN-NEXT: v_writelane_b32 v42, s37, 3 -; GCN-NEXT: v_writelane_b32 v42, s30, 4 -; GCN-NEXT: v_writelane_b32 v42, s31, 5 -; GCN-NEXT: v_mov_b32_e32 v41, v1 -; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s30, 4 +; GCN-NEXT: v_writelane_b32 v40, s31, 5 ; GCN-NEXT: s_mov_b64 s[34:35], exec ; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v40 -; GCN-NEXT: v_readfirstlane_b32 s7, v41 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[40:41] +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc ; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] ; GCN-NEXT: s_cbranch_execnz BB6_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_readlane_b32 s4, v42, 4 -; GCN-NEXT: v_readlane_b32 s5, v42, 5 -; GCN-NEXT: v_readlane_b32 s37, v42, 3 -; GCN-NEXT: v_readlane_b32 s36, v42, 2 -; GCN-NEXT: v_readlane_b32 s35, v42, 1 -; GCN-NEXT: v_readlane_b32 s34, v42, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 4 +; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v42, 6 +; GCN-NEXT: v_readlane_b32 s33, v40, 6 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] call amdgpu_gfx void %fptr(i32 inreg 123) ret void } + +define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr) { +; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v41, s33, 6 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v41, s34, 0 +; GCN-NEXT: v_writelane_b32 v41, s35, 1 +; GCN-NEXT: v_writelane_b32 v41, s36, 2 +; GCN-NEXT: v_writelane_b32 v41, s37, 3 +; GCN-NEXT: v_writelane_b32 v41, s30, 4 +; GCN-NEXT: v_writelane_b32 v41, s31, 5 +; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: s_mov_b64 s[34:35], exec +; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s5, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc +; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: ; implicit-def: 
$vgpr1_vgpr2 +; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] +; GCN-NEXT: s_cbranch_execnz BB7_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: v_readlane_b32 s4, v41, 4 +; GCN-NEXT: v_readlane_b32 s5, v41, 5 +; GCN-NEXT: v_readlane_b32 s37, v41, 3 +; GCN-NEXT: v_readlane_b32 s36, v41, 2 +; GCN-NEXT: v_readlane_b32 s35, v41, 1 +; GCN-NEXT: v_readlane_b32 s34, v41, 0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v41, 6 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void %fptr(i32 %i) + ret i32 %i +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -18,6 +18,8 @@ ; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX10-NEXT: s_and_saveexec_b32 s0, s0 ; GFX10-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execnz BB0_1 @@ -44,6 +46,8 @@ ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr4 ; GFX9-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz BB0_1 ; GFX9-NEXT: ; %bb.2: @@ -68,6 +72,8 @@ ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz BB0_1 ; GFX8-NEXT: ; %bb.2: diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -30,7 +30,7 @@ # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec # W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc -# W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec +# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec # W64-LABEL: bb.2: # W64: $exec = S_MOV_B64 [[SAVEEXEC]] @@ -55,7 +55,7 @@ # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] -# W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec +# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec # W32-LABEL: bb.2: # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] --- @@ -103,7 +103,7 @@ # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec # W64: {{[0-9]+}}:vgpr_32 = 
BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc -# W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec +# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec # W64-LABEL: bb.2: # W64: $exec = S_MOV_B64 [[SAVEEXEC]] @@ -128,7 +128,7 @@ # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] -# W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec +# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec # W32-LABEL: bb.2: # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] --- @@ -176,7 +176,7 @@ # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec # W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc -# W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec +# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec # W64-LABEL: bb.2: # W64: $exec = S_MOV_B64 [[SAVEEXEC]] @@ -201,7 +201,7 @@ # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] -# W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec +# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec # W32-LABEL: bb.2: # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] --- @@ -286,7 +286,7 @@ # W64-NO-ADDR64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec # W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # W64-NO-ADDR64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc -# W64-NO-ADDR64: S_CBRANCH_EXECNZ %bb.1, implicit $exec +# W64-NO-ADDR64: SI_WATERFALL_LOOP %bb.1, implicit $exec # W64-NO-ADDR64-LABEL: bb.2: # W64-NO-ADDR64: $exec = S_MOV_B64 [[SAVEEXEC]] @@ -309,7 +309,7 @@ # W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] -# W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec +# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec # W32-LABEL: bb.2: # W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -13,19 +13,20 @@ ; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: flat_load_dwordx2 v[2:3], v[6:7] -; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] +; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-NEXT: v_readfirstlane_b32 s10, v2 -; GCN-NEXT: v_readfirstlane_b32 s11, v3 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] -; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GCN-NEXT: 
v_readfirstlane_b32 s8, v2 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3] +; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 +; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4
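(Illustrative sketch, not from the test suite: a minimal IR input of the kind the indirect-call tests above are generated from, with a hypothetical function name. A callee pointer held in VGPRs is divergent, so the backend wraps the call in a v_readfirstlane/s_and_saveexec waterfall loop like the ones checked above, and with this patch the pointer's live range is allowed to end inside the loop, which is what the new "; implicit-def:" lines in the expectations record.)
  ; Hypothetical example, modeled on the tests in indirect-call.ll above.
  define void @waterfall_example_hypothetical(void()* %fptr) {
    ; %fptr arrives in VGPRs, so the indirect call below gets a waterfall loop.
    call amdgpu_gfx void %fptr()
    ret void
  }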