diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -122,7 +122,7 @@
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
-  void processPHINode(MachineInstr &MI);
+  MachineBasicBlock *processPHINode(MachineInstr &MI);
 
   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
 
@@ -594,9 +594,9 @@
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
-    MachineBasicBlock &MBB = *BI;
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-         I != E; ++I) {
+    MachineBasicBlock *MBB = &*BI;
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+         ++I) {
       MachineInstr &MI = *I;
 
       switch (MI.getOpcode()) {
@@ -620,9 +620,9 @@
           Register TmpReg
             = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 
-          BuildMI(MBB, MI, MI.getDebugLoc(),
+          BuildMI(*MBB, MI, MI.getDebugLoc(),
                   TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
-            .add(MI.getOperand(1));
+              .add(MI.getOperand(1));
           MI.getOperand(1).setReg(TmpReg);
         }
 
@@ -632,7 +632,15 @@
         if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
           Register SrcReg = MI.getOperand(1).getReg();
           if (!SrcReg.isVirtual()) {
-            TII->moveToVALU(MI, MDT);
+            MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
+            if (NewBB && NewBB != MBB) {
+              MBB = NewBB;
+              E = MBB->end();
+              BI = MachineFunction::iterator(MBB);
+              BE = MF.end();
+            }
+            assert((!NewBB || NewBB == I->getParent()) &&
+                   "moveToVALU did not return the right basic block");
             break;
           }
 
@@ -647,7 +655,15 @@
             MI.setDesc(TII->get(SMovOp));
             break;
           }
-          TII->moveToVALU(MI, MDT);
+          MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
+          if (NewBB && NewBB != MBB) {
+            MBB = NewBB;
+            E = MBB->end();
+            BI = MachineFunction::iterator(MBB);
+            BE = MF.end();
+          }
+          assert((!NewBB || NewBB == I->getParent()) &&
+                 "moveToVALU did not return the right basic block");
         } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
           tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
         }
@@ -655,10 +671,18 @@
         break;
       }
       case AMDGPU::PHI: {
-        processPHINode(MI);
+        MachineBasicBlock *NewBB = processPHINode(MI);
+        if (NewBB && NewBB != MBB) {
+          MBB = NewBB;
+          E = MBB->end();
+          BI = MachineFunction::iterator(MBB);
+          BE = MF.end();
+        }
+        assert((!NewBB || NewBB == I->getParent()) &&
+               "moveToVALU did not return the right basic block");
         break;
       }
-      case AMDGPU::REG_SEQUENCE:
+      case AMDGPU::REG_SEQUENCE: {
        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
            !hasVectorOperands(MI, TRI)) {
          foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
@@ -667,8 +691,17 @@
 
        LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
 
-       TII->moveToVALU(MI, MDT);
+       MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
+       if (NewBB && NewBB != MBB) {
+         MBB = NewBB;
+         E = MBB->end();
+         BI = MachineFunction::iterator(MBB);
+         BE = MF.end();
+       }
+       assert((!NewBB || NewBB == I->getParent()) &&
+              "moveToVALU did not return the right basic block");
        break;
+      }
      case AMDGPU::INSERT_SUBREG: {
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
@@ -678,7 +711,15 @@
            (TRI->hasVectorRegisters(Src0RC) ||
             TRI->hasVectorRegisters(Src1RC))) {
          LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
-         TII->moveToVALU(MI, MDT);
+         MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
+         if (NewBB && NewBB != MBB) {
+           MBB = NewBB;
+           E = MBB->end();
+           BI = MachineFunction::iterator(MBB);
+           BE = MF.end();
+         }
+         assert((!NewBB || NewBB == I->getParent()) &&
+                "moveToVALU did not return the right basic block");
        }
        break;
      }
@@ -753,12 +794,13 @@
   return true;
 }
 
-void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
+MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
   unsigned numVGPRUses = 0;
   bool AllAGPRUses = true;
   SetVector<const MachineInstr *> worklist;
   SmallSet<const MachineInstr *, 4> Visited;
   SetVector<MachineInstr *> PHIOperands;
+  MachineBasicBlock *CreatedBB = nullptr;
   worklist.insert(&MI);
   Visited.insert(&MI);
   while (!worklist.empty()) {
@@ -850,7 +892,7 @@
        RC0 != &AMDGPU::VReg_1RegClass) &&
       (hasVGPRInput || numVGPRUses > 1)) {
     LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
-    TII->moveToVALU(MI);
+    CreatedBB = TII->moveToVALU(MI);
   } else {
     LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
@@ -861,4 +903,5 @@
   while (!PHIOperands.empty()) {
     processPHINode(*PHIOperands.pop_back_val());
   }
+  return CreatedBB;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -81,8 +81,9 @@
 private:
   void swapOperands(MachineInstr &Inst) const;
 
-  bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
-                        MachineDominatorTree *MDT = nullptr) const;
+  std::pair<bool, MachineBasicBlock *>
+  moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                   MachineDominatorTree *MDT = nullptr) const;
 
   void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
                    MachineDominatorTree *MDT = nullptr) const;
 
@@ -906,13 +907,15 @@
   /// Legalize all operands in this instruction. This function may create new
   /// instructions and control-flow around \p MI. If present, \p MDT is
   /// updated.
-  void legalizeOperands(MachineInstr &MI,
-                        MachineDominatorTree *MDT = nullptr) const;
+  /// \returns A new basic block that contains \p MI if new blocks were created.
+  MachineBasicBlock *
+  legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT = nullptr) const;
 
   /// Replace this instruction's opcode with the equivalent VALU
   /// opcode. This function will also move the users of \p MI to the
   /// VALU if necessary. If present, \p MDT is updated.
-  void moveToVALU(MachineInstr &MI, MachineDominatorTree *MDT = nullptr) const;
+  MachineBasicBlock *moveToVALU(MachineInstr &MI,
+                                MachineDominatorTree *MDT = nullptr) const;
 
   void insertNoop(MachineBasicBlock &MBB,
                   MachineBasicBlock::iterator MI) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4826,10 +4826,12 @@
 
 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
 // with SGPRs by iterating over all unique values across all lanes.
-static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
-                              MachineOperand &Rsrc, MachineDominatorTree *MDT,
-                              MachineBasicBlock::iterator Begin = nullptr,
-                              MachineBasicBlock::iterator End = nullptr) {
+// Returns the loop basic block that now contains \p MI.
+static MachineBasicBlock *
+loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
+                  MachineOperand &Rsrc, MachineDominatorTree *MDT,
+                  MachineBasicBlock::iterator Begin = nullptr,
+                  MachineBasicBlock::iterator End = nullptr) {
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
@@ -4903,6 +4905,7 @@
   // Restore the EXEC mask
   MachineBasicBlock::iterator First = RemainderBB->begin();
   BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
+  return LoopBB;
 }
 
 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
@@ -4948,33 +4951,35 @@
   return std::make_tuple(RsrcPtr, NewSRsrc);
 }
 
-void SIInstrInfo::legalizeOperands(MachineInstr &MI,
-                                   MachineDominatorTree *MDT) const {
+MachineBasicBlock *
+SIInstrInfo::legalizeOperands(MachineInstr &MI,
+                              MachineDominatorTree *MDT) const {
   MachineFunction &MF = *MI.getParent()->getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineBasicBlock *CreatedBB = nullptr;
 
   // Legalize VOP2
   if (isVOP2(MI) || isVOPC(MI)) {
     legalizeOperandsVOP2(MRI, MI);
-    return;
+    return CreatedBB;
   }
 
   // Legalize VOP3
   if (isVOP3(MI)) {
     legalizeOperandsVOP3(MRI, MI);
-    return;
+    return CreatedBB;
   }
 
   // Legalize SMRD
   if (isSMRD(MI)) {
     legalizeOperandsSMRD(MRI, MI);
-    return;
+    return CreatedBB;
   }
 
   // Legalize FLAT
   if (isFLAT(MI)) {
     legalizeOperandsFLAT(MRI, MI);
-    return;
+    return CreatedBB;
   }
 
   // Legalize REG_SEQUENCE and PHI
@@ -5057,7 +5062,7 @@
       }
     }
 
-    return;
+    return CreatedBB;
   }
 
   // Legalize INSERT_SUBREG
@@ -5072,7 +5077,7 @@
       MachineOperand &Op = MI.getOperand(1);
       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
     }
-    return;
+    return CreatedBB;
   }
 
   // Legalize SI_INIT_M0
@@ -5080,7 +5085,7 @@
     MachineOperand &Src = MI.getOperand(0);
     if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
-    return;
+    return CreatedBB;
   }
 
   // Legalize MIMG and MUBUF/MTBUF for shaders.
@@ -5093,13 +5098,13 @@
       (isMUBUF(MI) || isMTBUF(MI)))) {
     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
-      loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
+      CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
 
     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
-      loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
+      CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
 
-    return;
+    return CreatedBB;
   }
 
   // Legalize SI_CALL
@@ -5125,7 +5130,7 @@
       while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
              MI.definesRegister(End->getOperand(1).getReg()))
         ++End;
-      loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End);
+      CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End);
     }
   }
 
@@ -5140,7 +5145,7 @@
                              RI.getRegClass(RsrcRC))) {
       // The operands are legal.
       // FIXME: We may need to legalize operands besided srsrc.
-      return;
+      return CreatedBB;
     }
 
     // Legalize a VGPR Rsrc.
@@ -5274,15 +5279,19 @@
     } else {
       // This is another variant; legalize Rsrc with waterfall loop from VGPRs
       // to SGPRs.
-      loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
+      CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
+      return CreatedBB;
     }
   }
+  return CreatedBB;
 }
 
-void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
-                             MachineDominatorTree *MDT) const {
+MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
+                                           MachineDominatorTree *MDT) const {
   SetVectorType Worklist;
   Worklist.insert(&TopInst);
+  MachineBasicBlock *CreatedBB = nullptr;
+  MachineBasicBlock *CreatedBBTmp = nullptr;
 
   while (!Worklist.empty()) {
     MachineInstr &Inst = *Worklist.pop_back_val();
@@ -5302,13 +5311,18 @@
       Inst.eraseFromParent();
      continue;
    case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32:
+    case AMDGPU::S_SUB_I32: {
      // FIXME: The u32 versions currently selected use the carry.
-      if (moveScalarAddSub(Worklist, Inst, MDT))
+      bool Changed;
+      std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
+      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
+        CreatedBB = CreatedBBTmp;
+      if (Changed)
        continue;
 
      // Default handling
      break;
+    }
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
      Inst.eraseFromParent();
@@ -5489,7 +5503,9 @@
              .add(Inst.getOperand(3))
              .addReg(CarryInReg)
              .addImm(0);
-      legalizeOperands(*CarryOp);
+      CreatedBBTmp = legalizeOperands(*CarryOp);
+      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
+        CreatedBB = CreatedBBTmp;
       MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
       addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
       Inst.eraseFromParent();
@@ -5515,7 +5531,9 @@
              .add(Src1)
              .addImm(0); // clamp bit
 
-      legalizeOperands(*NewInstr, MDT);
+      CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
+      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
+        CreatedBB = CreatedBBTmp;
 
       MRI.replaceRegWith(Dest0.getReg(), DestReg);
       addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
@@ -5534,7 +5552,9 @@
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
       // We cannot move this instruction to the VALU, so we should try to
       // legalize its operands instead.
-      legalizeOperands(Inst, MDT);
+      CreatedBBTmp = legalizeOperands(Inst, MDT);
+      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
+        CreatedBB = CreatedBBTmp;
       continue;
     }
 
@@ -5625,16 +5645,20 @@
     }
 
     // Legalize the operands
-    legalizeOperands(Inst, MDT);
+    CreatedBBTmp = legalizeOperands(Inst, MDT);
+    if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
+      CreatedBB = CreatedBBTmp;
 
     if (HasDst)
       addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
   }
+  return CreatedBB;
 }
 
 // Add/sub require special handling to deal with carry outs.
-bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
-                                   MachineDominatorTree *MDT) const {
+std::pair<bool, MachineBasicBlock *>
+SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                              MachineDominatorTree *MDT) const {
   if (ST.hasAddNoCarry()) {
     // Assume there is no user of scc since we don't select this in that case.
     // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
@@ -5659,13 +5683,13 @@
     Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
     Inst.addImplicitDefUseOperands(*MBB.getParent());
     MRI.replaceRegWith(OldDstReg, ResultReg);
-    legalizeOperands(Inst, MDT);
+    MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
 
     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
-    return true;
+    return std::make_pair(true, NewBB);
   }
 
-  return false;
+  return std::make_pair(false, nullptr);
 }
 
 void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -469,3 +469,111 @@
   %b = add i32 %a, 1
   ret i32 %b
 }
+
+define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
+; GCN-LABEL: test_indirect_call_vgpr_ptr_in_branch:
+; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[16:17]
+; GCN-NEXT:    v_writelane_b32 v43, s33, 19
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_add_u32 s32, s32, 0x800
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v43, s34, 0
+; GCN-NEXT:    v_writelane_b32 v43, s35, 1
+; GCN-NEXT:    v_writelane_b32 v43, s36, 2
+; GCN-NEXT:    v_writelane_b32 v43, s38, 3
+; GCN-NEXT:    v_writelane_b32 v43, s39, 4
+; GCN-NEXT:    v_writelane_b32 v43, s40, 5
+; GCN-NEXT:    v_writelane_b32 v43, s41, 6
+; GCN-NEXT:    v_writelane_b32 v43, s42, 7
+; GCN-NEXT:    v_writelane_b32 v43, s43, 8
+; GCN-NEXT:    v_writelane_b32 v43, s44, 9
+; GCN-NEXT:    v_writelane_b32 v43, s45, 10
+; GCN-NEXT:    v_writelane_b32 v43, s46, 11
+; GCN-NEXT:    v_writelane_b32 v43, s47, 12
+; GCN-NEXT:    v_writelane_b32 v43, s48, 13
+; GCN-NEXT:    v_writelane_b32 v43, s49, 14
+; GCN-NEXT:    v_writelane_b32 v43, s50, 15
+; GCN-NEXT:    v_writelane_b32 v43, s51, 16
+; GCN-NEXT:    v_mov_b32_e32 v40, v31
+; GCN-NEXT:    s_mov_b32 s34, s14
+; GCN-NEXT:    s_mov_b32 s35, s13
+; GCN-NEXT:    s_mov_b32 s36, s12
+; GCN-NEXT:    s_mov_b64 s[38:39], s[10:11]
+; GCN-NEXT:    s_mov_b64 s[40:41], s[8:9]
+; GCN-NEXT:    s_mov_b64 s[42:43], s[6:7]
+; GCN-NEXT:    s_mov_b64 s[44:45], s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v42, v1
+; GCN-NEXT:    v_mov_b32_e32 v41, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    s_and_saveexec_b64 s[46:47], vcc
+; GCN-NEXT:    s_cbranch_execz BB5_4
+; GCN-NEXT:  ; %bb.1: ; %bb1
+; GCN-NEXT:    v_writelane_b32 v43, s30, 17
+; GCN-NEXT:    v_writelane_b32 v43, s31, 18
+; GCN-NEXT:    s_mov_b64 s[48:49], exec
+; GCN-NEXT:  BB5_2: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s16, v41
+; GCN-NEXT:    v_readfirstlane_b32 s17, v42
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42]
+; GCN-NEXT:    s_and_saveexec_b64 s[50:51], vcc
+; GCN-NEXT:    s_mov_b64 s[4:5], s[44:45]
+; GCN-NEXT:    s_mov_b64 s[6:7], s[42:43]
+; GCN-NEXT:    s_mov_b64 s[8:9], s[40:41]
+; GCN-NEXT:    s_mov_b64 s[10:11], s[38:39]
+; GCN-NEXT:    s_mov_b32 s12, s36
+; GCN-NEXT:    s_mov_b32 s13, s35
+; GCN-NEXT:    s_mov_b32 s14, s34
+; GCN-NEXT:    v_mov_b32_e32 v31, v40
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[50:51]
+; GCN-NEXT:    s_cbranch_execnz BB5_2
+; GCN-NEXT:  ; %bb.3:
+; GCN-NEXT:    s_mov_b64 exec, s[48:49]
+; GCN-NEXT:    v_readlane_b32 s30, v43, 17
+; GCN-NEXT:    v_readlane_b32 s31, v43, 18
+; GCN-NEXT:  BB5_4: ; %bb2
+; GCN-NEXT:    s_or_b64 exec, exec, s[46:47]
+; GCN-NEXT:    v_readlane_b32 s51, v43, 16
+; GCN-NEXT:    v_readlane_b32 s50, v43, 15
+; GCN-NEXT:    v_readlane_b32 s49, v43, 14
+; GCN-NEXT:    v_readlane_b32 s48, v43, 13
+; GCN-NEXT:    v_readlane_b32 s47, v43, 12
+; GCN-NEXT:    v_readlane_b32 s46, v43, 11
+; GCN-NEXT:    v_readlane_b32 s45, v43, 10
+; GCN-NEXT:    v_readlane_b32 s44, v43, 9
+; GCN-NEXT:    v_readlane_b32 s43, v43, 8
+; GCN-NEXT:    v_readlane_b32 s42, v43, 7
+; GCN-NEXT:    v_readlane_b32 s41, v43, 6
+; GCN-NEXT:    v_readlane_b32 s40, v43, 5
+; GCN-NEXT:    v_readlane_b32 s39, v43, 4
+; GCN-NEXT:    v_readlane_b32 s38, v43, 3
+; GCN-NEXT:    v_readlane_b32 s36, v43, 2
+; GCN-NEXT:    v_readlane_b32 s35, v43, 1
+; GCN-NEXT:    v_readlane_b32 s34, v43, 0
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x800
+; GCN-NEXT:    v_readlane_b32 s33, v43, 19
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  br i1 %cond, label %bb1, label %bb2
+
+bb1:
+  call void %fptr()
+  br label %bb2
+
+bb2:
+  ret void
+}