Index: llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -101,7 +101,8 @@ bool NeedToBeConvertedToVALU = false; // Unique ID. Used as a key for mapping to keep permanent order. unsigned ID; - + // Flag to check if use of copy result need to be sgpr + bool CopyResultUseNeedToBeSgpr = false; // Count of another VGPR to SGPR copies that contribute to the // current copy SChain unsigned SiblingPenalty = 0; @@ -909,9 +910,29 @@ } } else if (Inst->getNumExplicitDefs() != 0) { Register Reg = Inst->getOperand(0).getReg(); - if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) - for (auto &U : MRI->use_instructions(Reg)) + for (auto &U : MRI->use_instructions(Reg)) { + if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) Users.push_back(&U); + + if (Inst->isCopy()) { + unsigned Opc = U.getOpcode(); + // If MUBUF or MTBUF, soffset and srsrc operands + // need to be scalar registers. + if (TII->isMUBUF(Opc) || TII->isMTBUF(Opc)) { + const MachineOperand *soffsetMO = + TII->getNamedOperand(U, AMDGPU::OpName::soffset); + + const MachineOperand *srsrcMO = + TII->getNamedOperand(U, AMDGPU::OpName::srsrc); + + // set CopyResultUseNeedToBeSgpr to true if use of result of copy + // is in MUBUF/MTBUF soffset or srsrc. 
+ if ((soffsetMO && Reg == soffsetMO->getReg()) || + (srsrcMO && Reg == srsrcMO->getReg())) + Info.CopyResultUseNeedToBeSgpr = true; + } + } + } } for (auto U : Users) { if (TII->isSALU(*U)) @@ -925,6 +946,12 @@ // The main function that computes the VGPR to SGPR copy score // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { + + if (Info->CopyResultUseNeedToBeSgpr) { + Info->Score = 0; + return false; + } + if (Info->SChain.empty()) { Info->Score = 0; return true; Index: llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -134,9 +134,10 @@ ; W64-LABEL: mubuf_vgpr_outside_entry -; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s{{[0-9]+}} +; W64: ; %bb.{{[0-9]+}}: +; W64: s_mov_b32 s{{[0-9]+}}, 17 ; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec - +; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s{{[0-9]+}} ; W64: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] @@ -150,36 +151,29 @@ ; W64: s_xor_b64 exec, exec, [[SAVE]] ; W64: s_cbranch_execnz [[LOOPBB0]] -; W64: s_mov_b64 exec, [[SAVEEXEC]] -; W64: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]] - ; W64: ; %bb.{{[0-9]+}}: -; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s{{[0-9]+}} -; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec - -; W64: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: -; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], 
v[[[VRSRC0]]:[[VRSRC1]]] -; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] -; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: buffer_load_format_x [[RES]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W64: s_xor_b64 exec, exec, [[SAVE]] -; W64: s_cbranch_execnz [[LOOPBB1]] - -; W64: s_mov_b64 exec, [[SAVEEXEC]] - -; W64: [[TERMBB]]: -; W64: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES]], off +; W64: s_mov_b64 exec, [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]] +; W64: v_and_b32_e32 v[[VRSRC0:[0-9]+]], 0x3ff, v[[VRSRC31:[0-9]+]] +; W64: s_cbranch_execz [[LOOPBB4:.LBB[0-9]+_[0-9]+]] +; W64: ; %bb.{{[0-9]+}}: +; W64: v_mov_b32_e32 v[[VRSRC0]], s{{[0-9]+}} +; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC4:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC5:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC6:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC7:[0-9]+]] +; W64: buffer_load_format_x [[RES]], v[[VRSRC0]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen + +; W64: [[LOOPBB4]]: +; W64: s_or_b64 exec, exec, [[CMP0]] +; W64-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES]], off +; W64: s_setpc_b64 s{{\[[0-9]+:[0-9]+\]}} ; W32-LABEL: mubuf_vgpr_outside_entry - -; W32-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4 -; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo +; W32: ; %bb.{{[0-9]+}}: +; W32: s_mov_b32 s{{[0-9]+}}, 17 +; W32: v_mov_b32_e32 [[IDX:v[0-9]+]], s{{[0-9]+}} +; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo ; W32: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] @@ -194,118 +188,162 @@ ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB0]] -; W32: s_mov_b32 exec_lo, [[SAVEEXEC]] -; W32: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]] - ; W32: ; %bb.{{[0-9]+}}: -; W32-DAG: v_mov_b32_e32 
[[IDX:v[0-9]+]], s4 -; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo - -; W32: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: -; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] -; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: buffer_load_format_x [[RES]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] -; W32: s_cbranch_execnz [[LOOPBB1]] - ; W32: s_mov_b32 exec_lo, [[SAVEEXEC]] +; W32: v_and_b32_e32 v[[VRSRC0]], 0x3ff, v[[VRSRC31:[0-9]+]] +; W32: s_cbranch_execz [[LOOPBB4:.LBB[0-9]+_[0-9]+]] -; W32: [[TERMBB]]: -; W32: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES]], off - +; W32: ; %bb.{{[0-9]+}}: +; W32: v_mov_b32_e32 v[[VRSRC0]], s{{[0-9]+}} +; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC4:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC5:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC6:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC7:[0-9]+]] +; W32: buffer_load_format_x [[RES]], v[[VRSRC0]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen + +; W32: [[LOOPBB4]]: +; W32: s_or_b32 exec_lo, exec_lo, [[CMP0]] +; W32-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES]], off +; W32: s_setpc_b64 s{{\[[0-9]+:[0-9]+\]}} ; Confirm spills do not occur between the XOR and branch that terminate the ; waterfall loop BBs. 
; W64-O0-LABEL: mubuf_vgpr_outside_entry - -; W64-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s{{[0-9]+}} -; W64-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], s{{[0-9]+}} -; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill -; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec - -; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; W64-O0: s_waitcnt vmcnt(0) -; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[S0]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W64-O0-DAG: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] -; W64-O0-DAG: s_mov_b32 s[[S1:[0-9]+]], s[[SRSRCTMP1]] -; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] -; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] -; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 ; 4-byte 
Folded Reload -; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen -; W64-O0: s_waitcnt vmcnt(0) -; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill -; W64-O0: s_xor_b64 exec, exec, [[SAVE]] -; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB0]] - -; XXX-W64-O0: s_mov_b64 exec, [[SAVEEXEC]] -; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload -; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill -; W64-O0: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]] - -; W64-O0: ; %bb.{{[0-9]+}}: ; %bb1 -; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill -; W64-O0-DAG: s_mov_b64 s[[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]], exec -; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]] -; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] - -; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0: s_waitcnt vmcnt(0) -; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[S0]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; 
W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W64-O0-DAG: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] -; W64-O0-DAG: s_mov_b32 s[[S1:[0-9]+]], s[[SRSRCTMP1]] -; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] -; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] -; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload -; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen -; W64-O0: s_waitcnt vmcnt(0) -; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill -; W64-O0: s_xor_b64 exec, exec, [[SAVE]] -; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]] - -; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload -; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] -; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] -; W64-O0: s_mov_b64 exec, s[[[SAVEEXEC0]]:[[SAVEEXEC1]]] -; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill - -; W64-O0: [[TERMBB]]: -; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Reload -; W64-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off +; W64-O0: ; %bb.{{[0-9]+}}: +; W64-O0: s_or_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], -1 +; W64-O0: buffer_store_dword v[[VRSRC8:[0-9]+]], off, 
[[STOREDWORD:s\[[0-9]+:[0-9]+\]]], s[[SRSRC32:[0-9]+]] +; W64-O0: s_mov_b64 exec, [[SAVEEXEC]] +; W64-O0: buffer_store_dword v[[VRSRC31:[0-9]+]], off, [[STOREDWORD]], s[[SRSRC32]] offset:56 +; W64-O0: buffer_store_dword v[[VRSRC11:[0-9]+]], off, [[STOREDWORD]], s[[SRSRC32]] offset:52 +; W64-O0: v_mov_b32_e32 v[[VRSRC9:[0-9]+]], v[[VRSRC7:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC10:[0-9]+]], v[[VRSRC6:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC11]], v[[VRSRC5:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC4:[0-9]+]] +; W64-O0: buffer_load_dword v[[VRSRC4]], off, [[STOREDWORD]], s[[SRSRC32]] offset:52 +; W64-O0: buffer_store_dword v[[VRSRC5]], off, [[STOREDWORD]], s[[SRSRC32]] offset:48 +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC3:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC6]], v[[VRSRC2:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC7]], v[[VRSRC1:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC13:[0-9]+]], v[[VRSRC0:[0-9]+]] +; W64-O0: buffer_load_dword v[[VRSRC0]], off, [[STOREDWORD]], s[[SRSRC32]] offset:48 +; W64-O0: v_mov_b32_e32 v[[VRSRC1]], v[[VRSRC11]] +; W64-O0: v_mov_b32_e32 v[[VRSRC2]], v[[VRSRC10]] +; W64-O0: v_mov_b32_e32 v[[VRSRC3]], v[[VRSRC9]] +; W64-O0: v_mov_b32_e32 v[[VRSRC14:[0-9]+]], v[[VRSRC7]] +; W64-O0: v_mov_b32_e32 v[[VRSRC15:[0-9]+]], v[[VRSRC6]] +; W64-O0: v_mov_b32_e32 v[[VRSRC16:[0-9]+]], v[[VRSRC5]] +; W64-O0: buffer_store_dword v[[VRSRC13]], off, [[STOREDWORD]], s[[SRSRC32]] offset:32 +; W64-O0: buffer_store_dword v[[VRSRC14]], off, [[STOREDWORD]], s[[SRSRC32]] offset:36 +; W64-O0: buffer_store_dword v[[VRSRC15]], off, [[STOREDWORD]], s[[SRSRC32]] offset:40 +; W64-O0: buffer_store_dword v[[VRSRC16]], off, [[STOREDWORD]], s[[SRSRC32]] offset:44 +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC12:[0-9]+]] +; W64-O0: buffer_store_dword v[[VRSRC4]], off, [[STOREDWORD]], s[[SRSRC32]] offset:24 +; W64-O0: buffer_store_dword v[[VRSRC5]], off, [[STOREDWORD]], s[[SRSRC32]] offset:28 +; W64-O0: buffer_store_dword v[[VRSRC0]], off, [[STOREDWORD]], 
s[[SRSRC32]] offset:8 +; W64-O0: buffer_store_dword v[[VRSRC1]], off, [[STOREDWORD]], s[[SRSRC32]] offset:12 +; W64-O0: buffer_store_dword v[[VRSRC2]], off, [[STOREDWORD]], s[[SRSRC32]] offset:16 +; W64-O0: buffer_store_dword v[[VRSRC3]], off, [[STOREDWORD]], s[[SRSRC32]] offset:20 +; W64-O0: s_mov_b32 s[[SRSRC4:[0-9]+]], 17 +; W64-O0: s_mov_b32 s[[SRSRC5:[0-9]+]], s[[SRSRC4]] +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC5]], 0 +; W64-O0: s_mov_b32 s[[SRSRC5]], 0 +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC5]], 1 +; W64-O0: v_mov_b32_e32 v[[VRSRC0]], s[[SRSRC4]] +; W64-O0: buffer_store_dword v[[VRSRC0]], off, [[STOREDWORD]], s[[SRSRC32]] offset:4 +; W64-O0: s_mov_b64 [[SAVEEXEC]], exec +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC4]], 2 +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC5]], 3 +; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: +; W64-O0: buffer_load_dword v[[VRSRC0]], off, [[STOREDWORD]], s[[SRSRC32]] offset:32 +; W64-O0: buffer_load_dword v[[VRSRC1]], off, [[STOREDWORD]], s[[SRSRC32]] offset:36 +; W64-O0: buffer_load_dword v[[VRSRC2]], off, [[STOREDWORD]], s[[SRSRC32]] offset:40 +; W64-O0: buffer_load_dword v[[VRSRC3]], off, [[STOREDWORD]], s[[SRSRC32]] offset:44 +; W64-O0: v_readfirstlane_b32 s[[SRSRC8:[0-9]+]], v[[VRSRC0]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC12:[0-9]+]], v[[VRSRC1]] +; W64-O0: s_mov_b32 s[[SRSRC4]], s[[SRSRC8]] +; W64-O0: s_mov_b32 s[[SRSRC5]], s[[SRSRC12]] +; W64-O0: v_cmp_eq_u64_e64 [[SAVEEXEC]], [[SAVEEXEC]], v{{\[[0-9]+:[0-9]+\]}} +; W64-O0: v_readfirstlane_b32 s[[SRSRC7:[0-9]+]], v[[VRSRC2]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC6:[0-9]+]], v[[VRSRC3]] +; W64-O0: s_mov_b32 s[[SRSRC10:[0-9]+]], s[[SRSRC7]] +; W64-O0: s_mov_b32 s[[SRSRC11:[0-9]+]], s[[SRSRC6]] +; W64-O0: v_cmp_eq_u64_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v[2:3] +; W64-O0: s_and_b64 [[SAVEEXEC]], [[SAVEEXEC]], s{{\[[0-9]+:[0-9]+\]}} +; W64-O0: s_mov_b32 s[[SRSRC9:[0-9]+]], s[[SRSRC12]] +; W64-O0: s_mov_b32 s[[SRSRC10]], s[[SRSRC7]] +; W64-O0: 
s_mov_b32 s[[SRSRC11]], s[[SRSRC6]] +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC8]], 4 +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC9]], 5 +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC10]], 6 +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC11]], 7 +; W64-O0: s_and_saveexec_b64 [[SAVEEXEC]], [[SAVEEXEC]] +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC4]], 8 +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC5]], 9 + +; W64-O0: ; %bb.{{[0-9]+}}: +; W64-O0: buffer_load_dword v[[VRSRC0]], off, [[STOREDWORD]], s[[SRSRC32]] offset:4 +; W64-O0: v_readlane_b32 s[[SRSRC4]], v[[VRSRC8]], 8 +; W64-O0: v_readlane_b32 s[[SRSRC5]], v[[VRSRC8]], 9 +; W64-O0: v_readlane_b32 s[[SRSRC8]], v[[VRSRC8]], 4 +; W64-O0: v_readlane_b32 s[[SRSRC9]], v[[VRSRC8]], 5 +; W64-O0: v_readlane_b32 s[[SRSRC10]], v[[VRSRC8]], 6 +; W64-O0: v_readlane_b32 s[[SRSRC11]], v[[VRSRC8]], 7 +; W64-O0: v_readlane_b32 s[[SRSRC6]], v[[VRSRC8]], 1 +; W64-O0: buffer_load_format_x v[[VRSRC0]], v[[VRSRC0]], s{{\[[0-9]+:[0-9]+\]}}, s[[SRSRC6]] idxen +; W64-O0: buffer_store_dword v[[VRSRC0]], off, [[STOREDWORD]], s[[SRSRC32]] offset:60 +; W64-O0: s_xor_b64 exec, exec, [[SAVEEXEC]] +; W64-O0: s_cbranch_execnz [[LOOPBB1]] +; W64-O0: ; %bb.{{[0-9]+}}: +; W64-O0: buffer_load_dword v[[VRSRC0]], off, [[STOREDWORD]], s[[SRSRC32]] offset:60 +; W64-O0: buffer_load_dword v[[VRSRC1]], off, [[STOREDWORD]], s[[SRSRC32]] offset:56 +; W64-O0: v_readlane_b32 s[[SRSRC6]], v[[VRSRC8]], 2 +; W64-O0: v_readlane_b32 s[[SRSRC7]], v[[VRSRC8]], 3 +; W64-O0: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}} +; W64-O0: v_readlane_b32 s[[SRSRC4]], v[[VRSRC8]], 1 +; W64-O0: s_mov_b32 s[[SRSRC5]], 0x3ff +; W64-O0: v_and_b32_e64 v[[VRSRC1]], v[[VRSRC1]], s[[SRSRC5]] +; W64-O0: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, v[[VRSRC1]], s[[SRSRC4]] +; W64-O0: buffer_store_dword v[[VRSRC0]], off, [[STOREDWORD]], s[[SRSRC32]] offset:64 +; W64-O0: s_mov_b64 [[SAVEEXEC]], exec +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC4]], 10 +; W64-O0: v_writelane_b32 
v[[VRSRC8]], s[[SRSRC5]], 11 +; W64-O0: s_and_b64 [[SAVEEXEC]], [[SAVEEXEC]], s{{\[[0-9]+:[0-9]+\]}} +; W64-O0: s_mov_b64 exec, [[SAVEEXEC]] +; W64-O0: s_cbranch_execz [[LOOPBB5:.LBB[0-9]+_[0-9]+]] +; W64-O0: ; %bb.{{[0-9]+}}: +; W64-O0: buffer_load_dword v[[VRSRC1]], off, [[STOREDWORD]], s[[SRSRC32]] offset:8 +; W64-O0: buffer_load_dword v[[VRSRC2]], off, [[STOREDWORD]], s[[SRSRC32]] offset:12 +; W64-O0: buffer_load_dword v[[VRSRC3]], off, [[STOREDWORD]], s[[SRSRC32]] offset:16 +; W64-O0: buffer_load_dword v[[VRSRC4]], off, [[STOREDWORD]], s[[SRSRC32]] offset:20 +; W64-O0: v_readlane_b32 s[[SRSRC4]], v[[VRSRC8]], 0 +; W64-O0: s_mov_b32 s[[SRSRC8]], 0 +; W64-O0: v_mov_b32_e32 v[[VRSRC0]], s[[SRSRC4]] +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC1]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC4]], v[[VRSRC5]] +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC2]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC11]], v[[VRSRC5]] +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC3]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC10]], v[[VRSRC5]] +; W64-O0: v_mov_b32_e32 v[[VRSRC1]], v[[VRSRC4]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC9]], v[[VRSRC1]] +; W64-O0: s_mov_b32 s[[SRSRC5]], s[[SRSRC11]] +; W64-O0: s_mov_b32 s[[SRSRC6]], s[[SRSRC10]] +; W64-O0: s_mov_b32 s[[SRSRC7]], s[[SRSRC9]] +; W64-O0: buffer_load_format_x v[[VRSRC0]], v[[VRSRC0]], s{{\[[0-9]+:[0-9]+\]}}, s[[SRSRC8]] idxen +; W64-O0: buffer_store_dword v[[VRSRC0]], off, [[STOREDWORD]], s[[SRSRC32]] offset:64 +; W64-O0: [[LOOPBB5]] +; W64-O0: buffer_load_dword v[[VRSRC0]], off, [[STOREDWORD]], s[[SRSRC32]] offset:24 +; W64-O0: buffer_load_dword v[[VRSRC1]], off, [[STOREDWORD]], s[[SRSRC32]] offset:28 +; W64-O0: buffer_load_dword v[[VRSRC2]], off, [[STOREDWORD]], s[[SRSRC32]] offset:64 +; W64-O0: v_readlane_b32 s[[SRSRC4]], v[[VRSRC8]], 10 +; W64-O0: v_readlane_b32 s[[SRSRC5]], v[[VRSRC8]], 11 +; W64-O0: s_or_b64 exec, exec, [[SAVEEXEC]] +; W64-O0: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VRSRC2]], off +; W64-O0: s_or_saveexec_b64 
[[SAVEEXEC]], -1 +; W64-O0: buffer_load_dword v[[VRSRC8]], off, [[STOREDWORD]], s[[SRSRC32]] +; W64-O0: s_mov_b64 exec, [[SAVEEXEC]] +; W64-O0: s_setpc_b64 s{{\[[0-9]+:[0-9]+\]}} define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %in, float addrspace(1)* %out) #0 { entry: Index: llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-buf.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-buf.ll @@ -0,0 +1,290 @@ +; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck %s + +define float @llvm_amdgcn_raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_load_f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret float %val +} + +define float @llvm_amdgcn_raw_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_load_f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit 
$exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret float %val +} + +define <2 x float> @llvm_amdgcn_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_load_v2f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + %val = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret <2 x float> %val +} + +define <2 x float> @llvm_amdgcn_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_load_v2f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = 
TBUFFER_LOAD_FORMAT_XY_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + %val = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret <2 x float> %val +} + +define <3 x float> @llvm_amdgcn_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_load_v3f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret <3 x float> %val +} + +define <3 x float> @llvm_amdgcn_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_load_v3f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + %val = call <3 x float> 
@llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret <3 x float> %val +} + +define <4 x float> @llvm_amdgcn_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_load_v4f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret <4 x float> %val +} + +define <4 x float> @llvm_amdgcn_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_load_v4f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + %val = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret <4 x float> %val +} + +define void 
@llvm_amdgcn_raw_buffer_store_f32(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_store_f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_tbuffer_store_f32(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_store_f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_buffer_store_v2f32(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_store_v2f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; 
CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_tbuffer_store_v2f32(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_store_v2f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_buffer_store_v3f32(<4 x i32> inreg %rsrc, <3 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_store_v3f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK: 
[[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_tbuffer_store_v3f32(<4 x i32> inreg %rsrc, <3 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_store_v3f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_buffer_store_v4f32(<4 x i32> inreg %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_store_v4f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; 
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_tbuffer_store_v4f32(<4 x i32> inreg %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_store_v4f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + ; CHECK-NEXT: {{ $}} + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) +declare float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32) +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) +declare <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) +declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) +declare <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) +declare void 
@llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) Index: llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -40,31 +40,15 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB0_1: ; %bb0 -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB0_2 Depth 2 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: flat_load_b128 v[2:5], v[0:1] -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s4, v2 -; GFX11-NEXT: v_readfirstlane_b32 s5, v3 -; GFX11-NEXT: v_readfirstlane_b32 s6, v4 -; GFX11-NEXT: v_readfirstlane_b32 s7, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: 
s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_store_b32 v0, v0, s[4:7], 0 offen -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_2 -; GFX11-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 +; GFX11-NEXT: buffer_store_b32 v0, v0, s[0:3], 0 offen ; GFX11-NEXT: s_branch .LBB0_1 entry: br label %bb0