Index: llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -101,7 +101,8 @@ bool NeedToBeConvertedToVALU = false; // Unique ID. Used as a key for mapping to keep permanent order. unsigned ID; - + // Flag to check if MUBUF/MTBUF needs scalar register. + bool HasMBUFScalarReg = false; // Count of another VGPR to SGPR copies that contribute to the // current copy SChain unsigned SiblingPenalty = 0; @@ -909,9 +910,17 @@ } } else if (Inst->getNumExplicitDefs() != 0) { Register Reg = Inst->getOperand(0).getReg(); - if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) - for (auto &U : MRI->use_instructions(Reg)) + if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) { + for (auto &U : MRI->use_instructions(Reg)) { Users.push_back(&U); + unsigned Opc = U.getOpcode(); + if ((MRI->getRegClass(Reg) == &AMDGPU::SGPR_128RegClass || + MRI->getRegClass(Reg) == &AMDGPU::SReg_32RegClass) && + (TII->isMUBUF(Opc) || TII->isMTBUF(Opc))) { + Info.HasMBUFScalarReg = true; + } + } + } } for (auto U : Users) { if (TII->isSALU(*U)) @@ -925,6 +934,12 @@ // The main function that computes the VGPR to SGPR copy score // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { + + if (Info->HasMBUFScalarReg) { + Info->Score = 0; + return false; + } + if (Info->SChain.empty()) { Info->Score = 0; return true; Index: llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -302,9 +302,9 @@ } ; GCN-LABEL: {{^}}vgpr_arg_src: +; GCN: s_mov_b32 s[[ONE:[0-9]+]] ; GCN: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0 -; GCN: s_mov_b32 s[[ZERO:[0-9]+]] -; GCN: s_load_dwordx4 
s{{\[[0-9]+:[0-9]+\]}}, s[[[READLANE]]:[[ZERO]]] +; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[[[READLANE]]:[[ONE]]], 0x0 define amdgpu_vs float @vgpr_arg_src(<4 x i32> addrspace(6)* %arg) { main_body: %tmp9 = load <4 x i32>, <4 x i32> addrspace(6)* %arg Index: llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -7,9 +7,9 @@ ; from constant/invariant memory. ; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_global_pointer_load: -; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]], +; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]], off, [[PTR1:s\[[0-9]+:[0-9]+\]]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b -; GCN: buffer_store_dword [[K]], [[PTR]] +; GCN: buffer_store_dword [[K]], off, [[PTR1]] define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 { %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0 %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -7,83 +7,45 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX10-LABEL: main: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_mov_b32 s1, exec_lo -; GFX10-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 -; GFX10-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-NEXT: v_readfirstlane_b32 s6, v2 -; GFX10-NEXT: v_readfirstlane_b32 s7, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX10-NEXT: 
v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX10-NEXT: s_and_saveexec_b32 s0, s0 -; GFX10-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen -; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_format_d16_xyz v[0:1], v4, s[0:3], 0 idxen +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: ds_write2_b32 v2, v0, v1 offset0:7 offset1:8 ; ; GFX9-LABEL: main: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: v_readfirstlane_b32 s7, v3 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX9-NEXT: ; implicit-def: $vgpr4 -; GFX9-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 -; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX9-NEXT: 
v_and_b32_e32 v1, 0xffff, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:7 offset1:8 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 3 +; GFX9-NEXT: buffer_load_format_d16_xyz v[0:1], v4, s[0:3], 0 idxen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:7 offset1:8 ; ; GFX8-LABEL: main: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 -; GFX8-NEXT: v_readfirstlane_b32 s6, v2 -; GFX8-NEXT: v_readfirstlane_b32 s7, v3 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX8-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_write2_b32 v2, v0, v1 offset0:7 offset1:8 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_nop 2 +; GFX8-NEXT: buffer_load_format_d16_xyz v[0:1], v4, s[0:3], 0 idxen +; GFX8-NEXT: 
s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: ds_write2_b32 v2, v0, v1 offset0:7 offset1:8 bb: %i = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 undef) %i2 = call nsz arcp <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %arg, i32 %arg1, i32 0, i32 0, i32 0) Index: llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -6,17 +6,12 @@ ; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32 ; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add: +; GCN-DAG: s_load_dwordx2 s[[[SLDPTRLO:[0-9]+]]:[[SLDPTRHI:[0-9]+]]], s[[[SLDPTRLO]]:[[SLDPTRHI]]], 0x0 ; GCN-DAG: buffer_load_dwordx2 v[[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]] -; GCN-DAG: s_load_dwordx2 s[[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] -; GCN-DAG: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] -; GCN-NOT: v_mov_b32 -; GCN-NOT: v_mov_b32 - -; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]] -; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] -; GCN: buffer_load_ubyte v{{[0-9]+}}, v[[[PTRLO]]:[[PTRHI]]], +; GCN-DAG: v_mov_b32_e32 v[[LDPTRLO]], s[[SLDPTRLO]] +; GCN-DAG: v_mov_b32_e32 v[[LDPTRHI]], s[[SLDPTRHI]] +; GCN: buffer_load_ubyte v{{[0-9]+}}, v[[[LDPTRLO]]:[[LDPTRHI]]], define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, [8 x i32], i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { bb: Index: llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -1,126 +1,77 @@ ; RUN: llc 
-march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W32 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64 -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W32 -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W32-GFX11 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-GFX11 ; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0 ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions. 
; W64-LABEL: mubuf_vgpr -; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec -; W64: [[LOOPBB:.LBB[0-9]+_[0-9]+]]: ; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] -; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] ; W64: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W64: s_xor_b64 exec, exec, [[AND]] -; W64: s_cbranch_execnz [[LOOPBB]] -; W64: s_mov_b64 exec, [[SAVEEXEC]] -; W64: v_mov_b32_e32 v0, [[RES]] +; W64: s_setpc_b64 s[{{[0-9]+}}:{{[0-9]+}}] ; W32-LABEL: mubuf_vgpr -; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo -; W32: [[LOOPBB:.LBB[0-9]+_[0-9]+]]: ; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] -; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] ; W32: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] -; W32: s_cbranch_execnz [[LOOPBB]] -; W32: s_mov_b32 exec_lo, [[SAVEEXEC]] -; W32: v_mov_b32_e32 v0, [[RES]] +; W32: s_setpc_b64 s[{{[0-9]+}}:{{[0-9]+}}] define float @mubuf_vgpr(<4 x i32> %i, i32 
%c) #0 { %call = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i32 0, i32 0) #1 ret float %call } - ; W64-LABEL: mubuf_vgpr_adjacent_in_block - -; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec -; W64: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: -; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] -; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W64: s_xor_b64 exec, exec, [[SAVE]] -; W64: s_cbranch_execnz [[LOOPBB0]] - -; W64: s_mov_b64 exec, [[SAVEEXEC]] -; FIXME: redundant s_mov -; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec - -; W64: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: -; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] -; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: buffer_load_format_x [[RES1:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W64: s_xor_b64 exec, exec, [[SAVE]] -; W64: s_cbranch_execnz [[LOOPBB1]] - -; W64: s_mov_b64 exec, 
[[SAVEEXEC]] -; W64-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES0]], off -; W64-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES1]], off - +; W64-DAG: v_readfirstlane_b32 s[[SRSRC8:[0-9]+]], v[[VRSRC0:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC9:[0-9]+]], v[[VRSRC1:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC10:[0-9]+]], v[[VRSRC2:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC11:[0-9]+]], v[[VRSRC3:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC4:[0-9]+]], v[[VRSRC4:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC5:[0-9]+]], v[[VRSRC5:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC6:[0-9]+]], v[[VRSRC6:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC7:[0-9]+]], v[[VRSRC7:[0-9]+]] +; W64: buffer_load_format_x v[[VRSRC0]], v[[SRSRC8]], s[[[SRSRC8]]:[[SRSRC11]]], 0 idxen +; W64: buffer_load_format_x v[[VRSRC1]], v[[SRSRC8]], s[[[SRSRC4]]:[[SRSRC7]]], 0 idxen +; W64: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v[[VRSRC0]], off +; W64: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v[[VRSRC1]], off +; W64: s_setpc_b64 s[{{[0-9]+}}:{{[0-9]+}}] + +; W64-GFX11-LABEL: mubuf_vgpr_adjacent_in_block +; W64-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC4:[0-9]+]], v[[VRSRC0:[0-9]+]] +; W64-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC5:[0-9]+]], v[[VRSRC1:[0-9]+]] +; W64-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC6:[0-9]+]], v[[VRSRC2:[0-9]+]] +; W64-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC7:[0-9]+]], v[[VRSRC3:[0-9]+]] +; W64-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC4:[0-9]+]] +; W64-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC5:[0-9]+]] +; W64-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC6:[0-9]+]] +; W64-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC7:[0-9]+]] +; W64-GFX11: buffer_load_format_x v[[VRSRC0]], v{{[0-9]+}}, s[[[SRSRC4]]:[[SRSRC7]]], 0 idxen +; W64-GFX11: buffer_load_format_x v[[VRSRC1]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen +; W64-GFX11: global_store_b32 
v[{{[0-9]+}}:{{[0-9]+}}], v[[VRSRC0]], off dlc +; W64-GFX11: global_store_b32 v[{{[0-9]+}}:{{[0-9]+}}], v[[VRSRC1]], off dlc +; W64-GFX11: s_setpc_b64 s[{{[0-9]+}}:{{[0-9]+}}] ; W32-LABEL: mubuf_vgpr_adjacent_in_block - -; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo -; W32: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: -; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] -; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] -; W32: s_cbranch_execnz [[LOOPBB0]] - -; W32: s_mov_b32 exec_lo, [[SAVEEXEC]] -; FIXME: redundant s_mov -; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo - -; W32: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: -; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] -; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] -; W32: s_cbranch_execnz [[LOOPBB1]] - -; W32: s_mov_b32 exec_lo, [[SAVEEXEC]] -; W32-DAG: 
global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES0]], off -; W32-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES1]], off +; W32-DAG: v_readfirstlane_b32 s[[SRSRC8:[0-9]+]], v[[VRSRC0:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC9:[0-9]+]], v[[VRSRC1:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC10:[0-9]+]], v[[VRSRC2:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC11:[0-9]+]], v[[VRSRC3:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC4:[0-9]+]], v[[VRSRC4:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC5:[0-9]+]], v[[VRSRC5:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC6:[0-9]+]], v[[VRSRC6:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC7:[0-9]+]], v[[VRSRC7:[0-9]+]] +; W32: buffer_load_format_x v[[VRSRC0]], v[[SRSRC8]], s[[[SRSRC8]]:[[SRSRC11]]], 0 idxen +; W32: buffer_load_format_x v[[VRSRC1]], v[[SRSRC8]], s[[[SRSRC4]]:[[SRSRC7]]], 0 idxen +; W32: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v[[VRSRC0]], off +; W32: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v[[VRSRC1]], off +; W32: s_setpc_b64 s[{{[0-9]+}}:{{[0-9]+}}] define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %out0, float addrspace(1)* %out1) #0 { entry: @@ -131,181 +82,164 @@ ret void } - ; W64-LABEL: mubuf_vgpr_outside_entry - -; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s{{[0-9]+}} -; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec - -; W64: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: -; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] -; W64: 
s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W64: s_xor_b64 exec, exec, [[SAVE]] -; W64: s_cbranch_execnz [[LOOPBB0]] - -; W64: s_mov_b64 exec, [[SAVEEXEC]] -; W64: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]] - ; W64: ; %bb.{{[0-9]+}}: -; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s{{[0-9]+}} -; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec - -; W64: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: -; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] -; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] -; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: buffer_load_format_x [[RES]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W64: s_xor_b64 exec, exec, [[SAVE]] -; W64: s_cbranch_execnz [[LOOPBB1]] - -; W64: s_mov_b64 exec, [[SAVEEXEC]] - -; W64: [[TERMBB]]: -; W64: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES]], off - +; W64-DAG: v_readfirstlane_b32 s[[SRSRC8:[0-9]+]], v[[VRSRC0:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC9:[0-9]+]], v[[VRSRC1:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC10:[0-9]+]], v[[VRSRC2:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC11:[0-9]+]], v[[VRSRC3:[0-9]+]] +; W64: s_mov_b32 s[[SRSRC4:[0-9]+]], 17 +; W64: v_mov_b32_e32 v[[VRSRC0]], s[[SRSRC4]] +; W64: v_and_b32_e32 v[[VRSRC1]], 0x3ff, v{{[0-9]+}} +; W64-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[VRSRC1]] +; W64-DAG: buffer_load_format_x v[[VRSRC0]], v[[VRSRC0]], s[[[SRSRC8]]:[[SRSRC11]]], 0 idxen +; W64: s_and_saveexec_b64 
[[SAVE:s\[[0-9]+:[0-9]+\]]], vcc +; W64: s_cbranch_execz [[LOOPBB2:.LBB[0-9]+_[0-9]+]] +; W64: ; %bb.{{[0-9]+}}: +; W64: v_mov_b32_e32 v[[VRSRC0]], s[[SRSRC4]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC8:[0-9]+]], v[[VRSRC4:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC9:[0-9]+]], v[[VRSRC5:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC10:[0-9]+]], v[[VRSRC6:[0-9]+]] +; W64-DAG: v_readfirstlane_b32 s[[SRSRC11:[0-9]+]], v[[VRSRC7:[0-9]+]] +; W64: buffer_load_format_x v[[VRSRC0]], v[[VRSRC0]], s[[[SRSRC8]]:[[SRSRC11]]], 0 idxen +; W64: [[LOOPBB2]]: +; W64: s_or_b64 exec, exec, [[SAVE]] +; W64: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VRSRC0]], off +; W64: s_setpc_b64 s[{{[0-9]+}}:{{[0-9]+}}] ; W32-LABEL: mubuf_vgpr_outside_entry - -; W32-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4 -; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo - -; W32: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: -; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] -; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] -; W32: s_cbranch_execnz [[LOOPBB0]] - -; W32: s_mov_b32 exec_lo, [[SAVEEXEC]] -; W32: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]] - ; W32: ; %bb.{{[0-9]+}}: -; W32-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4 -; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo - -; W32: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: -; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 
s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] -; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] -; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: buffer_load_format_x [[RES]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen -; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] -; W32: s_cbranch_execnz [[LOOPBB1]] - -; W32: s_mov_b32 exec_lo, [[SAVEEXEC]] - -; W32: [[TERMBB]]: -; W32: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES]], off - - -; Confirm spills do not occur between the XOR and branch that terminate the -; waterfall loop BBs. +; W32-DAG: v_readfirstlane_b32 s[[SRSRC8:[0-9]+]], v[[VRSRC0:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC9:[0-9]+]], v[[VRSRC1:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC10:[0-9]+]], v[[VRSRC2:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC11:[0-9]+]], v[[VRSRC3:[0-9]+]] +; W32: s_mov_b32 s[[SRSRC4:[0-9]+]], 17 +; W32: v_mov_b32_e32 v[[VRSRC0]], s[[SRSRC4]] +; W32: v_and_b32_e32 v[[VRSRC1]], 0x3ff, v{{[0-9]+}} +; W32: buffer_load_format_x v[[VRSRC0]], v[[VRSRC0]], s[[[SRSRC8]]:[[SRSRC11]]], 0 idxen +; W32: v_cmp_eq_u32_e32 vcc_lo, 0, v[[VRSRC1]] +; W32: s_and_saveexec_b32 s[[SRSRC5:[0-9]+]], vcc_lo +; W32: s_cbranch_execz [[LOOPBB2:.LBB[0-9]+_[0-9]+]] +; W32: ; %bb.{{[0-9]+}}: +; W32: v_mov_b32_e32 v[[VRSRC0]], s[[SRSRC4]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC8:[0-9]+]], v[[VRSRC4:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC9:[0-9]+]], v[[VRSRC5:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC10:[0-9]+]], v[[VRSRC6:[0-9]+]] +; W32-DAG: v_readfirstlane_b32 s[[SRSRC11:[0-9]+]], v[[VRSRC7:[0-9]+]] +; W32: buffer_load_format_x v[[VRSRC0]], v[[VRSRC0]], s[[[SRSRC8]]:[[SRSRC11]]], 0 idxen +; W32: 
[[LOOPBB2]]: +; W32: s_or_b32 exec_lo, exec_lo, s[[SRSRC5]] +; W32: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VRSRC0]], off +; W32: s_setpc_b64 s[{{[0-9]+}}:{{[0-9]+}}] + +; W32-GFX11-LABEL: mubuf_vgpr_outside_entry +; W32-GFX11: ; %bb.{{[0-9]+}}: +; W32-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]] +; W32-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] +; W32-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] +; W32-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] +; W32-GFX11: s_mov_b32 s[[SRSRC4:[0-9]+]], 17 +; W32-GFX11: v_dual_mov_b32 v[[VRSRC0]], s[[SRSRC4]] :: v_dual_and_b32 v[[VRSRC1]], 0x3ff, v31 +; W32-GFX11: buffer_load_format_x v[[VRSRC0]], v[[VRSRC0]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen +; W32-GFX11: s_mov_b32 s[[SRSRC0]], exec_lo +; W32-GFX11: v_cmpx_eq_u32_e32 0, v[[VRSRC1]] +; W32-GFX11: s_cbranch_execz [[LOOPBB2:.LBB[0-9]+_[0-9]+]] +; W32-GFX11: ; %bb.{{[0-9]+}}: +; W32-GFX11: v_mov_b32_e32 v[[VRSRC0]], s[[SRSRC4]] +; W32-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC4:[0-9]+]], v[[VRSRC4:[0-9]+]] +; W32-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC5:[0-9]+]], v[[VRSRC5:[0-9]+]] +; W32-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC6:[0-9]+]], v[[VRSRC6:[0-9]+]] +; W32-GFX11-DAG: v_readfirstlane_b32 s[[SRSRC7:[0-9]+]], v[[VRSRC7:[0-9]+]] +; W32-GFX11: buffer_load_format_x v[[VRSRC0]], v[[VRSRC0]], s[[[SRSRC4]]:[[SRSRC7]]], 0 idxen +; W32-GFX11: [[LOOPBB2]]: +; W32-GFX11: s_or_b32 exec_lo, exec_lo, s[[SRSRC0]] +; W32-GFX11: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, v[[VRSRC0]], off dlc +; W32-GFX11: s_setpc_b64 s[{{[0-9]+}}:{{[0-9]+}}] ; W64-O0-LABEL: mubuf_vgpr_outside_entry - -; W64-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s{{[0-9]+}} -; W64-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], s{{[0-9]+}} -; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill -; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], 
exec - -; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; W64-O0: s_waitcnt vmcnt(0) -; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[S0]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] -; W64-O0-DAG: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] -; W64-O0-DAG: s_mov_b32 s[[S1:[0-9]+]], s[[SRSRCTMP1]] -; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] -; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] -; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 ; 4-byte Folded Reload -; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen -; W64-O0: s_waitcnt vmcnt(0) -; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill -; W64-O0: s_xor_b64 exec, exec, [[SAVE]] -; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB0]] - 
-; XXX-W64-O0: s_mov_b64 exec, [[SAVEEXEC]] -; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload -; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill -; W64-O0: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]] - -; W64-O0: ; %bb.{{[0-9]+}}: ; %bb1 -; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill -; W64-O0-DAG: s_mov_b64 s[[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]], exec -; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]] -; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] - -; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0: s_waitcnt vmcnt(0) -; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[S0]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]] -; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]] -; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], 
v[[[VRSRC2]]:[[VRSRC3]]] -; W64-O0-DAG: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] -; W64-O0-DAG: s_mov_b32 s[[S1:[0-9]+]], s[[SRSRCTMP1]] -; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] -; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] -; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload -; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen -; W64-O0: s_waitcnt vmcnt(0) -; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill -; W64-O0: s_xor_b64 exec, exec, [[SAVE]] -; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]] - -; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload -; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] -; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] -; W64-O0: s_mov_b64 exec, s[[[SAVEEXEC0]]:[[SAVEEXEC1]]] -; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill - -; W64-O0: [[TERMBB]]: -; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Reload -; W64-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off +; W64-O0: ; %bb.0: ; %entry +; W64-O0: s_or_saveexec_b64 s[[[SRSRC4:[0-9]+]]:[[SRSRC5:[0-9]+]]], -1 +; W64-O0: buffer_store_dword v[[VRSRC8:[0-9]+]], off, s[[[SRSRC0:[0-9]+]]:[[SRSRC3:[0-9]+]]], s[[SRSRC32:[0-9]+]] +; W64-O0: s_mov_b64 exec, s[[[SRSRC4]]:[[SRSRC5]]] +; W64-O0: buffer_store_dword v[[VRSRC11:[0-9]+]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:36 +; W64-O0: v_mov_b32_e32 v[[VRSRC10:[0-9]+]], v[[VRSRC7:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC11]], v[[VRSRC6:[0-9]+]] +; W64-O0: 
v_mov_b32_e32 v[[VRSRC13:[0-9]+]], v[[VRSRC5:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC4:[0-9]+]] +; W64-O0: buffer_load_dword v[[VRSRC4]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:36 +; W64-O0: buffer_store_dword v[[VRSRC5]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:32 +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC3:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC6]], v[[VRSRC2:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC7]], v[[VRSRC1:[0-9]+]] +; W64-O0: v_mov_b32_e32 v[[VRSRC9:[0-9]+]], v[[VRSRC0:[0-9]+]] +; W64-O0: buffer_load_dword v[[VRSRC0]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:32 +; W64-O0: v_mov_b32_e32 v[[VRSRC1]], v[[VRSRC13]] +; W64-O0: v_mov_b32_e32 v[[VRSRC2]], v[[VRSRC11]] +; W64-O0: v_mov_b32_e32 v[[VRSRC3]], v[[VRSRC10]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC8:[0-9]+]], v[[VRSRC9]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC6:[0-9]+]], v[[VRSRC7]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC5:[0-9]+]], v[[VRSRC6]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC4:[0-9]+]], v[[VRSRC5]] +; W64-O0: s_mov_b32 s[[SRSRC9:[0-9]+]], s[[SRSRC6]] +; W64-O0: s_mov_b32 s[[SRSRC10:[0-9]+]], s[[SRSRC5]] +; W64-O0: s_mov_b32 s[[SRSRC11:[0-9]+]], s[[SRSRC4]] +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC12:[0-9]+]] +; W64-O0: buffer_store_dword v[[VRSRC4]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:24 +; W64-O0: buffer_store_dword v[[VRSRC5]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:28 +; W64-O0: buffer_store_dword v[[VRSRC0]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:8 +; W64-O0: buffer_store_dword v[[VRSRC1]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:12 +; W64-O0: buffer_store_dword v[[VRSRC2]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:16 +; W64-O0: buffer_store_dword v[[VRSRC3]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:20 +; W64-O0: s_mov_b32 s[[SRSRC4]], 17 +; W64-O0: s_mov_b32 s[[SRSRC5]], s[[SRSRC4]] +; W64-O0: s_mov_b32 s[[SRSRC4]], s[[SRSRC5]] +; W64-O0: v_writelane_b32 
v[[VRSRC8]], s[[SRSRC4]], 0 +; W64-O0: s_mov_b32 s[[SRSRC4]], 0 +; W64-O0: v_mov_b32_e32 v[[VRSRC0]], s[[SRSRC5]] +; W64-O0: buffer_load_format_x v[[VRSRC0]], v[[VRSRC0]], s[[[SRSRC8]]:[[SRSRC11]]], s[[SRSRC4]] idxen +; W64-O0: s_mov_b32 s[[SRSRC5]], 0x3ff +; W64-O0: v_and_b32_e64 v[[VRSRC1]], v[[VRSRC31:[0-9]+]], s[[SRSRC5]] +; W64-O0: v_cmp_eq_u32_e64 s[[[SRSRC6]]:[[SRSRC7:[0-9]+]]], v[[VRSRC1]], s[[SRSRC4]] +; W64-O0: buffer_store_dword v[[VRSRC0]], off, s[0:3], s32 offset:4 +; W64-O0: s_mov_b64 s[[[SRSRC4]]:[[SRSRC5]]], exec +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC4]], 1 +; W64-O0: v_writelane_b32 v[[VRSRC8]], s[[SRSRC5]], 2 +; W64-O0: s_and_b64 s[4:5], s[[[SRSRC4]]:[[SRSRC5]]], s[[[SRSRC6]]:[[SRSRC7]]] +; W64-O0: s_mov_b64 exec, s[[[SRSRC4]]:[[SRSRC5]]] +; W64-O0: s_cbranch_execz [[LOOPBB2:.LBB[0-9]+_[0-9]+]] +; W64-O0:; %bb.1: ; %bb1 +; W64-O0: buffer_load_dword v[[VRSRC1]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:8 +; W64-O0: buffer_load_dword v[[VRSRC2]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:12 +; W64-O0: buffer_load_dword v[[VRSRC3]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:16 +; W64-O0: buffer_load_dword v[[VRSRC4]], off, s[[[SRSRC0]]:[[SRSRC3]]], s32 offset:20 +; W64-O0: v_readlane_b32 s[[SRSRC4]], v[[VRSRC8]], 0 +; W64-O0: s_mov_b32 s[[SRSRC8]], 0 +; W64-O0: v_mov_b32_e32 v[[VRSRC0]], s[[SRSRC4]] +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC1]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC4]], v[[VRSRC5]] +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC2]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC11]], v[[VRSRC5]] +; W64-O0: v_mov_b32_e32 v[[VRSRC5]], v[[VRSRC3]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC10]], v[[VRSRC5]] +; W64-O0: v_mov_b32_e32 v[[VRSRC1]], v[[VRSRC4]] +; W64-O0: v_readfirstlane_b32 s[[SRSRC9]], v[[VRSRC1]] +; W64-O0: s_mov_b32 s[[SRSRC5]], s[[SRSRC11]] +; W64-O0: s_mov_b32 s[[SRSRC6]], s[[SRSRC10]] +; W64-O0: s_mov_b32 s[[SRSRC7]], s[[SRSRC9]] +; W64-O0: buffer_load_format_x v[[VRSRC0]], v[[VRSRC0]], 
s[[[SRSRC4]]:[[SRSRC7]]], s[[SRSRC8]] idxen +; W64-O0: buffer_store_dword v[[VRSRC0]], off, s[[[SRSRC0]]:[[SRSRC3]]], s[[SRSRC32]] offset:4 +; W64-O0: [[LOOPBB2]]: ; %bb2 +; W64-O0: buffer_load_dword v[[VRSRC0]], off, s[[[SRSRC0]]:[[SRSRC3]]], s32 offset:24 +; W64-O0: buffer_load_dword v[[VRSRC1]], off, s[[[SRSRC0]]:[[SRSRC3]]], s32 offset:28 +; W64-O0: buffer_load_dword v[[VRSRC2]], off, s[[[SRSRC0]]:[[SRSRC3]]], s32 offset:4 +; W64-O0: v_readlane_b32 s[[SRSRC4]], v[[VRSRC8]], 1 +; W64-O0: v_readlane_b32 s[[SRSRC5]], v[[VRSRC8]], 2 +; W64-O0: s_or_b64 exec, exec, s[[[SRSRC4]]:[[SRSRC5]]] +; W64-O0: global_store_dword v[[[VRSRC0]]:[[VRSRC1]]], v[[VRSRC2]], off +; W64-O0: s_or_saveexec_b64 s[[[SRSRC4]]:[[SRSRC5]]], -1 +; W64-O0: buffer_load_dword v[[VRSRC8]], off, s[[[SRSRC0]]:[[SRSRC3]]], s32 +; W64-O0: s_mov_b64 exec, s[[[SRSRC4]]:[[SRSRC5]]] +; W64-O0: s_setpc_b64 s[[[SRSRC30:[0-9]+]]:[[SRSRC31:[0-9]+]]] define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %in, float addrspace(1)* %out) #0 { entry: Index: llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -5,7 +5,6 @@ # RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32 # RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64 # RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32 - # Test that we correctly legalize VGPR Rsrc operands in MUBUF 
instructions. # # On ADDR64 hardware we optimize the _ADDR64 and _OFFSET cases to avoid @@ -27,97 +26,39 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W64-LABEL: name: idxen - ; W64: successors: %bb.1(0x80000000) - ; W64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 - ; W64-NEXT: {{ $}} - ; W64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; W64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec - ; W64-NEXT: {{ $}} - ; W64-NEXT: .1: - ; W64-NEXT: successors: %bb.2(0x80000000) - ; W64-NEXT: {{ $}} - ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec - ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W64-NEXT: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec - ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec - ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; W64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; W64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; W64-NEXT: {{ $}} - ; W64-NEXT: .2: - ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; W64-NEXT: {{ $}} - ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec - ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W64-NEXT: {{ $}} - ; W64-NEXT: .3: - ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] - ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W64-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W64-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W64-NEXT: 
[[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; W32-LABEL: name: idxen - ; W32: successors: %bb.1(0x80000000) - ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, 
killed [[COPY8]], %subreg.sub3 - ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo - ; W32-NEXT: {{ $}} - ; W32-NEXT: .1: - ; W32-NEXT: successors: %bb.2(0x80000000) - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec - ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec - ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec - ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; W32-NEXT: {{ $}} - ; W32-NEXT: .2: - ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY1]], killed 
[[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W32-NEXT: {{ $}} - ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] - ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W32-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W32-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W32-NEXT: [[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %5:sreg_64 = COPY $sgpr30_sgpr31 @@ -147,97 +88,39 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W64-LABEL: name: offen - ; W64: successors: %bb.1(0x80000000) - ; W64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W64: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 
; W64-NEXT: {{ $}} - ; W64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; W64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec - ; W64-NEXT: {{ $}} - ; W64-NEXT: .1: - ; W64-NEXT: successors: %bb.2(0x80000000) - ; W64-NEXT: {{ $}} - ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec - ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec - ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec - ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; W64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; W64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; W64-NEXT: {{ $}} - ; W64-NEXT: .2: - ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; W64-NEXT: {{ $}} - ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec - ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec - ; W64-NEXT: {{ $}} - ; W64-NEXT: .3: - ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] - ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W64-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W64-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W64-NEXT: [[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; W64-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; W32-LABEL: name: offen - ; W32: successors: %bb.1(0x80000000) - ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo - ; W32-NEXT: {{ $}} - ; W32-NEXT: .1: - ; W32-NEXT: successors: %bb.2(0x80000000) - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 
[[REG_SEQUENCE]].sub1, implicit $exec - ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec - ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec - ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; W32-NEXT: {{ $}} - ; W32-NEXT: .2: - ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W32-NEXT: {{ $}} - ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] - ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; 
W32-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W32-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W32-NEXT: [[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %5:sreg_64 = COPY $sgpr30_sgpr31 @@ -267,97 +150,39 @@ bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W64-LABEL: name: bothen - ; W64: successors: %bb.1(0x80000000) - ; W64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W64: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W64-NEXT: {{ $}} - ; W64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; W64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 - ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = 
IMPLICIT_DEF - ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec - ; W64-NEXT: {{ $}} - ; W64-NEXT: .1: - ; W64-NEXT: successors: %bb.2(0x80000000) - ; W64-NEXT: {{ $}} - ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec - ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec - ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec - ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; W64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], 
%subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; W64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; W64-NEXT: {{ $}} - ; W64-NEXT: .2: - ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; W64-NEXT: {{ $}} - ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec - ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec - ; W64-NEXT: {{ $}} - ; W64-NEXT: .3: - ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] - ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W64-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W64-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W64-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W64-NEXT: [[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; W64-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] ; 
W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; W32-LABEL: name: bothen - ; W32: successors: %bb.1(0x80000000) - ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; W32-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 - ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W32-NEXT: {{ $}} - ; W32-NEXT: .1: - ; W32-NEXT: successors: %bb.2(0x80000000) - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec - ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit 
$exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec - ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec - ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; W32-NEXT: {{ $}} - ; W32-NEXT: .2: - ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec - ; W32-NEXT: {{ $}} - ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] - ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W32-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W32-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W32-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W32-NEXT: [[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %5:sreg_64 = COPY $sgpr30_sgpr31 @@ -389,60 +214,38 @@ ; ADDR64-LABEL: name: addr64 ; ADDR64: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; ADDR64-NEXT: {{ $}} - ; ADDR64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; ADDR64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 - ; ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, 
[[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 - ; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 - ; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 - ; ADDR64-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec - ; ADDR64-NEXT: %17:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1 - ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec - ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] - ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] + ; ADDR64-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; ADDR64-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; ADDR64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; ADDR64-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; ADDR64-NEXT: [[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; ADDR64-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; ADDR64-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; ADDR64-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; ADDR64-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] + ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64]] ; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; W32-LABEL: name: addr64 - ; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W32-NEXT: {{ $}} - ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; W32-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 - ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; W32-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 - ; W32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 822173696 - ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 - ; W32-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec - ; W32-NEXT: %17:vgpr_32, dead %20:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1 - ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] - ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] + ; W32-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W32-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W32-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W32-NEXT: [[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] + ; W32-NEXT: $vgpr0 = COPY 
[[BUFFER_LOAD_FORMAT_X_ADDR64]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %5:sreg_64 = COPY $sgpr30_sgpr31 %4:vreg_64 = COPY $vgpr4_vgpr5 @@ -458,7 +261,6 @@ ... - --- name: offset liveins: @@ -474,122 +276,55 @@ ; ADDR64-LABEL: name: offset ; ADDR64: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; ADDR64-NEXT: {{ $}} - ; ADDR64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; ADDR64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 - ; ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 - ; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 - ; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 - ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]].sub0, %subreg.sub0, [[COPY9]].sub1, %subreg.sub1 - ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec - ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] - ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] + ; ADDR64-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; ADDR64-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; ADDR64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; ADDR64-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; ADDR64-NEXT: [[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; ADDR64-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; ADDR64-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; ADDR64-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] + ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]] ; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; W64-NO-ADDR64-LABEL: name: offset - ; W64-NO-ADDR64: successors: %bb.1(0x80000000) - ; W64-NO-ADDR64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 + ; W64-NO-ADDR64: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W64-NO-ADDR64-NEXT: {{ $}} - ; W64-NO-ADDR64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; W64-NO-ADDR64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 - ; W64-NO-ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY 
$vgpr3 - ; W64-NO-ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; W64-NO-ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; W64-NO-ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; W64-NO-ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NO-ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NO-ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec - ; W64-NO-ADDR64-NEXT: {{ $}} - ; W64-NO-ADDR64-NEXT: .1: - ; W64-NO-ADDR64-NEXT: successors: %bb.2(0x80000000) - ; W64-NO-ADDR64-NEXT: {{ $}} - ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec - ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; W64-NO-ADDR64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec - ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; W64-NO-ADDR64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec - ; W64-NO-ADDR64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; W64-NO-ADDR64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; W64-NO-ADDR64-NEXT: {{ $}} - ; W64-NO-ADDR64-NEXT: .2: - ; W64-NO-ADDR64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; W64-NO-ADDR64-NEXT: {{ $}} - ; W64-NO-ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec - ; W64-NO-ADDR64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NO-ADDR64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec - ; W64-NO-ADDR64-NEXT: {{ $}} - ; W64-NO-ADDR64-NEXT: .3: - ; W64-NO-ADDR64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] - ; W64-NO-ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W64-NO-ADDR64-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W64-NO-ADDR64-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; W64-NO-ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W64-NO-ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W64-NO-ADDR64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W64-NO-ADDR64-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = 
V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; W64-NO-ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; W64-NO-ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] ; W64-NO-ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]] ; W64-NO-ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; W32-LABEL: name: offset - ; W32: successors: %bb.1(0x80000000) - ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 - ; W32-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 - ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; W32-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo - ; W32-NEXT: {{ $}} - ; W32-NEXT: .1: - ; W32-NEXT: successors: %bb.2(0x80000000) - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec - ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec - ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec - ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec - ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc - ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; W32-NEXT: {{ $}} - ; W32-NEXT: .2: - ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; W32-NEXT: {{ $}} - ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec - ; 
W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W32-NEXT: {{ $}} - ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] - ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; W32-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; W32-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 + ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; W32-NEXT: [[COPY0:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; W32-NEXT: [[V_READFIRSTLANE8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY0]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; W32-NEXT: [[V_READFIRSTLANE11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE8]], %subreg.sub0, [[V_READFIRSTLANE9]], %subreg.sub1, [[V_READFIRSTLANE10]], %subreg.sub2, [[V_READFIRSTLANE11]], %subreg.sub3 + ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY5]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %5:sreg_64 = COPY $sgpr30_sgpr31 Index: llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-buf.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-buf.ll @@ -0,0 +1,210 @@ +; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck %s + +define float 
@llvm_amdgcn_raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_load_f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret float %val +} + +define float @llvm_amdgcn_raw_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_load_f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret float %val +} + +define <2 x float> @llvm_amdgcn_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_load_v2f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + 
%val = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret <2 x float> %val +} + +define <2 x float> @llvm_amdgcn_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_load_v2f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + %val = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret <2 x float> %val +} + +define <3 x float> @llvm_amdgcn_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_load_v3f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + %val = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret <3 x float> %val +} + +define <3 x float> @llvm_amdgcn_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_load_v3f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + %val = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret <3 x float> %val +} + +define <4 x float> @llvm_amdgcn_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_load_v4f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + %val = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret <4 x float> %val +} + +define <4 x float> @llvm_amdgcn_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_load_v4f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + %val = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret <4 x float> %val +} + +define void @llvm_amdgcn_raw_buffer_store_f32(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: 
llvm_amdgcn_raw_buffer_store_f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_tbuffer_store_f32(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_store_f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_buffer_store_v2f32(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_store_v2f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define void 
@llvm_amdgcn_raw_tbuffer_store_v2f32(<4 x i32> inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_store_v2f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_buffer_store_v3f32(<4 x i32> inreg %rsrc, <3 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_store_v3f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_tbuffer_store_v3f32(<4 x i32> inreg %rsrc, <3 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_store_v3f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, 
[[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_buffer_store_v4f32(<4 x i32> inreg %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_buffer_store_v4f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define void @llvm_amdgcn_raw_tbuffer_store_v4f32(<4 x i32> inreg %rsrc, <4 x float> %val, i32 %voffset, i32 inreg %soffset) { + ; CHECK-LABEL: name: llvm_amdgcn_raw_tbuffer_store_v4f32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK: [[V_READFIRSTLANE_B32:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact {{.*}}, {{.*}}, killed {{.*}}, [[V_READFIRSTLANE_B32]], 0, 0, 0, 0, 0, implicit $exec + + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32 ) +declare float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32) +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) +declare <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32>, i32, 
i32, i32, i32) +declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) +declare <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) Index: llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -4,68 +4,39 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 { ; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 -; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo -; GCN-NEXT: .LBB0_1: ; %bb0 -; GCN-NEXT: ; =>This Loop Header: Depth=1 -; GCN-NEXT: ; Child Loop BB0_2 Depth 2 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] -; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_mov_b32 
s5, exec_lo -; GCN-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 -; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s8, v2 -; GCN-NEXT: v_readfirstlane_b32 s9, v3 -; GCN-NEXT: v_readfirstlane_b32 s10, v4 -; GCN-NEXT: v_readfirstlane_b32 s11, v5 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3] -; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] -; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 -; GCN-NEXT: s_and_saveexec_b32 s4, s4 -; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen -; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_waitcnt_depctr 0xffe3 -; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_cbranch_execnz .LBB0_2 -; GCN-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1 -; GCN-NEXT: s_mov_b32 exec_lo, s5 -; GCN-NEXT: s_branch .LBB0_1 +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: .LBB0_1: ; %bb0 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN-NEXT: flat_load_dwordx2 v[6:7], v[2:3] +; GCN-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GCN-NEXT: v_readfirstlane_b32 s4, v4 +; GCN-NEXT: v_readfirstlane_b32 s5, v5 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s6, v6 +; GCN-NEXT: v_readfirstlane_b32 s7, v7 +; GCN-NEXT: buffer_store_dword v0, v0, s[4:7], 0 offen +; GCN-NEXT: s_branch .LBB0_1 ; ; GFX11-LABEL: vgpr_descriptor_waterfall_loop_idom_update: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: .p2align 6 +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB0_1: ; %bb0 -; GFX11-NEXT: ; =>This Loop Header: 
Depth=1 -; GFX11-NEXT: ; Child Loop BB0_2 Depth 2 -; GFX11-NEXT: flat_load_b128 v[2:5], v[0:1] -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s4, v2 -; GFX11-NEXT: v_readfirstlane_b32 s5, v3 -; GFX11-NEXT: v_readfirstlane_b32 s6, v4 -; GFX11-NEXT: v_readfirstlane_b32 s7, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_store_b32 v0, v0, s[4:7], 0 offen -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_2 -; GFX11-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_branch .LBB0_1 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: flat_load_b128 v[2:5], v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 +; GFX11-NEXT: buffer_store_b32 v0, v0, s[0:3], 0 offen +; GFX11-NEXT: s_branch .LBB0_1 entry: br label %bb0