diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4700,9 +4700,13 @@ .addImm(AMDGPU::sub1); Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg) - .addReg(CurReg) - .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); + auto Cmp = + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg) + .addReg(CurReg); + if (NumSubRegs <= 2) + Cmp.addReg(VRsrc); + else + Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); // Combine the comparision results with AND. if (CondReg == AMDGPU::NoRegister) // First. @@ -4752,13 +4756,20 @@ // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register // with SGPRs by iterating over all unique values across all lanes. static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, - MachineOperand &Rsrc, MachineDominatorTree *MDT) { + MachineOperand &Rsrc, MachineDominatorTree *MDT, + MachineBasicBlock::iterator Begin = nullptr, + MachineBasicBlock::iterator End = nullptr) { MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineBasicBlock::iterator I(&MI); + if (!Begin.isValid()) + Begin = &MI; + if (!End.isValid()) { + End = &MI; + ++End; + } const DebugLoc &DL = MI.getDebugLoc(); unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; unsigned MovExecOpc = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; @@ -4767,13 +4778,17 @@ Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); // Save the EXEC mask - BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); + BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); // Killed uses in the instruction we are waterfalling around will be // incorrect due to the added control-flow. - for (auto &MO : MI.uses()) { - if (MO.isReg() && MO.isUse()) { - MRI.clearKillFlags(MO.getReg()); + MachineBasicBlock::iterator AfterMI = MI; + ++AfterMI; + for (auto I = Begin; I != AfterMI; I++) { + for (auto &MO : I->uses()) { + if (MO.isReg() && MO.isUse()) { + MRI.clearKillFlags(MO.getReg()); + } } } @@ -4790,11 +4805,11 @@ LoopBB->addSuccessor(LoopBB); LoopBB->addSuccessor(RemainderBB); - // Move MI to the LoopBB, and the remainder of the block to RemainderBB. - MachineBasicBlock::iterator J = I++; + // Move the range [Begin, End) to the LoopBB, and the remainder of the block + // to RemainderBB. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); - LoopBB->splice(LoopBB->begin(), &MBB, J); + RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); + LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end()); MBB.addSuccessor(LoopBB); @@ -5016,6 +5031,34 @@ return; } + // Legalize SI_CALL + if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { + MachineOperand *Dest = &MI.getOperand(0); + if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { + // Everything from the call frame setup (ADJCALLSTACKUP) through the call + // frame destroy (ADJCALLSTACKDOWN), plus the copies of the return value + // that follow it, must move into the loop block along with the call. 
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); + unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); + + // Extend the range back to the frame setup and on to the frame destroy + MachineBasicBlock &MBB = *MI.getParent(); + MachineBasicBlock::iterator Start(&MI); + while (Start->getOpcode() != FrameSetupOpcode) + --Start; + MachineBasicBlock::iterator End(&MI); + while (End->getOpcode() != FrameDestroyOpcode) + ++End; + // Step past the frame destroy, then past copies of registers MI defines + ++End; + while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && + MI.definesRegister(End->getOperand(1).getReg())) + ++End; + loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End); + } + } + // Legalize MUBUF* instructions. int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -197,13 +197,275 @@ ret void } -; FIXME -; define void @test_indirect_call_vgpr_ptr(void()* %fptr) { -; call void %fptr() -; ret void -; } +define void @test_indirect_call_vgpr_ptr(void()* %fptr) { +; GCN-LABEL: test_indirect_call_vgpr_ptr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x800 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v43, s34, 0 +; GCN-NEXT: v_writelane_b32 v43, 
s35, 1 +; GCN-NEXT: v_writelane_b32 v43, s36, 2 +; GCN-NEXT: v_writelane_b32 v43, s38, 3 +; GCN-NEXT: v_writelane_b32 v43, s39, 4 +; GCN-NEXT: v_writelane_b32 v43, s40, 5 +; GCN-NEXT: v_writelane_b32 v43, s41, 6 +; GCN-NEXT: v_writelane_b32 v43, s42, 7 +; GCN-NEXT: v_writelane_b32 v43, s43, 8 +; GCN-NEXT: v_writelane_b32 v43, s44, 9 +; GCN-NEXT: v_writelane_b32 v43, s45, 10 +; GCN-NEXT: v_writelane_b32 v43, s46, 11 +; GCN-NEXT: v_writelane_b32 v43, s47, 12 +; GCN-NEXT: v_writelane_b32 v43, s48, 13 +; GCN-NEXT: v_writelane_b32 v43, s49, 14 +; GCN-NEXT: v_writelane_b32 v43, s30, 15 +; GCN-NEXT: v_writelane_b32 v43, s31, 16 +; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_mov_b32 s34, s14 +; GCN-NEXT: s_mov_b32 s35, s13 +; GCN-NEXT: s_mov_b32 s36, s12 +; GCN-NEXT: s_mov_b64 s[38:39], s[10:11] +; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] +; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] +; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v42, v1 +; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s16, v41 +; GCN-NEXT: v_readfirstlane_b32 s17, v42 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] +; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] +; GCN-NEXT: s_mov_b64 s[8:9], s[40:41] +; GCN-NEXT: s_mov_b64 s[10:11], s[38:39] +; GCN-NEXT: s_mov_b32 s12, s36 +; GCN-NEXT: s_mov_b32 s13, s35 +; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] +; GCN-NEXT: s_cbranch_execnz BB2_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[46:47] +; GCN-NEXT: v_readlane_b32 s4, v43, 15 +; GCN-NEXT: v_readlane_b32 s5, v43, 16 +; GCN-NEXT: v_readlane_b32 s49, v43, 14 +; GCN-NEXT: v_readlane_b32 s48, v43, 13 +; GCN-NEXT: v_readlane_b32 s47, v43, 12 +; GCN-NEXT: v_readlane_b32 s46, v43, 11 +; 
GCN-NEXT: v_readlane_b32 s45, v43, 10 +; GCN-NEXT: v_readlane_b32 s44, v43, 9 +; GCN-NEXT: v_readlane_b32 s43, v43, 8 +; GCN-NEXT: v_readlane_b32 s42, v43, 7 +; GCN-NEXT: v_readlane_b32 s41, v43, 6 +; GCN-NEXT: v_readlane_b32 s40, v43, 5 +; GCN-NEXT: v_readlane_b32 s39, v43, 4 +; GCN-NEXT: v_readlane_b32 s38, v43, 3 +; GCN-NEXT: v_readlane_b32 s36, v43, 2 +; GCN-NEXT: v_readlane_b32 s35, v43, 1 +; GCN-NEXT: v_readlane_b32 s34, v43, 0 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_sub_u32 s32, s32, 0x800 +; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + call void %fptr() + ret void +} -; define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) { -; call void %fptr(i32 123) -; ret void -; } +define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) { +; GCN-LABEL: test_indirect_call_vgpr_ptr_arg: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x800 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v43, s34, 0 +; GCN-NEXT: v_writelane_b32 v43, s35, 1 +; GCN-NEXT: v_writelane_b32 v43, s36, 2 +; GCN-NEXT: 
v_writelane_b32 v43, s38, 3 +; GCN-NEXT: v_writelane_b32 v43, s39, 4 +; GCN-NEXT: v_writelane_b32 v43, s40, 5 +; GCN-NEXT: v_writelane_b32 v43, s41, 6 +; GCN-NEXT: v_writelane_b32 v43, s42, 7 +; GCN-NEXT: v_writelane_b32 v43, s43, 8 +; GCN-NEXT: v_writelane_b32 v43, s44, 9 +; GCN-NEXT: v_writelane_b32 v43, s45, 10 +; GCN-NEXT: v_writelane_b32 v43, s46, 11 +; GCN-NEXT: v_writelane_b32 v43, s47, 12 +; GCN-NEXT: v_writelane_b32 v43, s48, 13 +; GCN-NEXT: v_writelane_b32 v43, s49, 14 +; GCN-NEXT: v_writelane_b32 v43, s30, 15 +; GCN-NEXT: v_writelane_b32 v43, s31, 16 +; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_mov_b32 s34, s14 +; GCN-NEXT: s_mov_b32 s35, s13 +; GCN-NEXT: s_mov_b32 s36, s12 +; GCN-NEXT: s_mov_b64 s[38:39], s[10:11] +; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] +; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] +; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v42, v1 +; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s16, v41 +; GCN-NEXT: v_readfirstlane_b32 s17, v42 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GCN-NEXT: v_mov_b32_e32 v0, 0x7b +; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] +; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] +; GCN-NEXT: s_mov_b64 s[8:9], s[40:41] +; GCN-NEXT: s_mov_b64 s[10:11], s[38:39] +; GCN-NEXT: s_mov_b32 s12, s36 +; GCN-NEXT: s_mov_b32 s13, s35 +; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] +; GCN-NEXT: s_cbranch_execnz BB3_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[46:47] +; GCN-NEXT: v_readlane_b32 s4, v43, 15 +; GCN-NEXT: v_readlane_b32 s5, v43, 16 +; GCN-NEXT: v_readlane_b32 s49, v43, 14 +; GCN-NEXT: v_readlane_b32 s48, v43, 13 +; GCN-NEXT: v_readlane_b32 s47, v43, 12 +; GCN-NEXT: v_readlane_b32 s46, v43, 11 +; GCN-NEXT: v_readlane_b32 
s45, v43, 10 +; GCN-NEXT: v_readlane_b32 s44, v43, 9 +; GCN-NEXT: v_readlane_b32 s43, v43, 8 +; GCN-NEXT: v_readlane_b32 s42, v43, 7 +; GCN-NEXT: v_readlane_b32 s41, v43, 6 +; GCN-NEXT: v_readlane_b32 s40, v43, 5 +; GCN-NEXT: v_readlane_b32 s39, v43, 4 +; GCN-NEXT: v_readlane_b32 s38, v43, 3 +; GCN-NEXT: v_readlane_b32 s36, v43, 2 +; GCN-NEXT: v_readlane_b32 s35, v43, 1 +; GCN-NEXT: v_readlane_b32 s34, v43, 0 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_sub_u32 s32, s32, 0x800 +; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + call void %fptr(i32 123) + ret void +} + +define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) { +; GCN-LABEL: test_indirect_call_vgpr_ptr_ret: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x800 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v43, s34, 0 +; GCN-NEXT: v_writelane_b32 v43, s35, 1 +; GCN-NEXT: v_writelane_b32 v43, s36, 2 +; GCN-NEXT: v_writelane_b32 v43, s38, 3 +; GCN-NEXT: v_writelane_b32 v43, s39, 4 +; GCN-NEXT: v_writelane_b32 v43, s40, 5 +; GCN-NEXT: v_writelane_b32 v43, s41, 
6 +; GCN-NEXT: v_writelane_b32 v43, s42, 7 +; GCN-NEXT: v_writelane_b32 v43, s43, 8 +; GCN-NEXT: v_writelane_b32 v43, s44, 9 +; GCN-NEXT: v_writelane_b32 v43, s45, 10 +; GCN-NEXT: v_writelane_b32 v43, s46, 11 +; GCN-NEXT: v_writelane_b32 v43, s47, 12 +; GCN-NEXT: v_writelane_b32 v43, s48, 13 +; GCN-NEXT: v_writelane_b32 v43, s49, 14 +; GCN-NEXT: v_writelane_b32 v43, s30, 15 +; GCN-NEXT: v_writelane_b32 v43, s31, 16 +; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_mov_b32 s34, s14 +; GCN-NEXT: s_mov_b32 s35, s13 +; GCN-NEXT: s_mov_b32 s36, s12 +; GCN-NEXT: s_mov_b64 s[38:39], s[10:11] +; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] +; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] +; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v42, v1 +; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: BB4_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s16, v41 +; GCN-NEXT: v_readfirstlane_b32 s17, v42 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] +; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] +; GCN-NEXT: s_mov_b64 s[8:9], s[40:41] +; GCN-NEXT: s_mov_b64 s[10:11], s[38:39] +; GCN-NEXT: s_mov_b32 s12, s36 +; GCN-NEXT: s_mov_b32 s13, s35 +; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: v_mov_b32_e32 v31, v40 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] +; GCN-NEXT: s_cbranch_execnz BB4_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[46:47] +; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GCN-NEXT: v_readlane_b32 s4, v43, 15 +; GCN-NEXT: v_readlane_b32 s5, v43, 16 +; GCN-NEXT: v_readlane_b32 s49, v43, 14 +; GCN-NEXT: v_readlane_b32 s48, v43, 13 +; GCN-NEXT: v_readlane_b32 s47, v43, 12 +; GCN-NEXT: v_readlane_b32 s46, v43, 11 +; GCN-NEXT: v_readlane_b32 s45, v43, 10 +; GCN-NEXT: v_readlane_b32 s44, v43, 9 +; GCN-NEXT: v_readlane_b32 s43, v43, 8 +; GCN-NEXT: v_readlane_b32 s42, v43, 7 +; GCN-NEXT: 
v_readlane_b32 s41, v43, 6 +; GCN-NEXT: v_readlane_b32 s40, v43, 5 +; GCN-NEXT: v_readlane_b32 s39, v43, 4 +; GCN-NEXT: v_readlane_b32 s38, v43, 3 +; GCN-NEXT: v_readlane_b32 s36, v43, 2 +; GCN-NEXT: v_readlane_b32 s35, v43, 1 +; GCN-NEXT: v_readlane_b32 s34, v43, 0 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_sub_u32 s32, s32, 0x800 +; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + %a = call i32 %fptr() + %b = add i32 %a, 1 + ret i32 %b +}