diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -474,65 +474,29 @@ ArrayRef BaseOps2, unsigned NumLoads, unsigned NumBytes) const { + // If the mem ops (to be clustered) do not have the same base ptr, then they + // should not be clustered assert(!BaseOps1.empty() && !BaseOps2.empty()); const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); - if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) return false; - const MachineOperand *FirstDst = nullptr; - const MachineOperand *SecondDst = nullptr; - - if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || - (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || - (isMIMG(FirstLdSt) && isMIMG(SecondLdSt)) || - (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { - const unsigned MaxGlobalLoadCluster = 7; - if (NumLoads > MaxGlobalLoadCluster) - return false; - - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); - if (!FirstDst) - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); - if (!SecondDst) - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); - } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } - - if (!FirstDst || !SecondDst) - return false; - - // Try to limit clustering based on the total number of bytes loaded - // rather than the number of instructions. This is done to help reduce - // register pressure. 
The method used is somewhat inexact, though, - // because it assumes that all loads in the cluster will load the - // same number of bytes as FirstLdSt. - - // The unit of this value is bytes. - // FIXME: This needs finer tuning. - unsigned LoadClusterThreshold = 16; - - const MachineRegisterInfo &MRI = - FirstLdSt.getParent()->getParent()->getRegInfo(); - - const Register Reg = FirstDst->getReg(); - - const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg) - ? MRI.getRegClass(Reg) - : RI.getPhysRegClass(Reg); - - // FIXME: NumLoads should not be subtracted 1. This is to match behavior - // of clusterNeighboringMemOps which was previosly passing cluster length - // less 1. LoadClusterThreshold should be tuned instead. - return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <= - LoadClusterThreshold; + // In order to avoid register pressure, on average, the number of DWORDS + // loaded together by all clustered mem ops should not exceed 8. This is an + // empirical value based on certain observations and performance related + // experiments. + // The good thing about this heuristic is - it avoids clustering of too many + // sub-word loads, and also avoids clustering of wide loads. Below is the + // brief summary of how the heuristic behaves for various `LoadSize`. + // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops + // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops + // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops + // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops + // (5) LoadSize >= 17: do not cluster + const unsigned LoadSize = NumBytes / NumLoads; + const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; + return NumDWORDs <= 8; } // FIXME: This behaves strangely. 
If, for example, you have 32 load + stores, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -950,22 +950,22 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s8, s8, 63 -; GFX6-NEXT: s_bfe_u32 s9, s8, 0x20002 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_endpgm +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 63 +; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { %src = load i32, i32 addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -17,57 +17,56 @@ ; ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0 -; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off offset:-10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off offset:-7 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xff -; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt 
vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s4, v11 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v3, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v7, v3, v8 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v10 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v11 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v4, v5 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v6, v7 -; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0 +; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off offset:-10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off offset:-9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off 
offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xff +; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v3 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v11, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v3 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v6, v3, v7 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v9 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v10, v3, v11 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; 
GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v7 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v8, v2 +; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX7-UNALIGNED: ; %bb.0: @@ -156,31 +155,30 @@ ; ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0 -; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v5, v3, v6 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v3, v4 -; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0 +; GFX9-NOUNALIGNED-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, v3, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v6, v3, v2 +; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX7-UNALIGNED: ; %bb.0: @@ -194,35 +192,36 @@ ; ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX7-NOUNALIGNED: ; %bb.0: -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 
offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s4, v0 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v2 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v3 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v6, v5 -; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt 
vmcnt(3) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2 ret <3 x i32> %load } @@ -399,97 +398,101 @@ ; ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 1 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 3 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 5 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s3 -; 
GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 7 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v15, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v14, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v17, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v16, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v19, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v18, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v20, v[10:11], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v21, v[12:13], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[14:15], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[16:17], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[18:19], off -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s1 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s0 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[10:11], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[12:13], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[6:7], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[8:9], off -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 -; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff -; GFX9-NOUNALIGNED-NEXT: 
v_mov_b32_e32 v6, 8 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v2 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v2 -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v21, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v14, v5 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v4, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v2 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v11, v5 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v15, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 1 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 +; 
GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 3 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 5 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 7 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v15, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v14, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v17, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v18, v[4:5], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v19, v[6:7], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[8:9], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[10:11], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[12:13], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[14:15], off +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s1 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[4:5], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[6:7], off +; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, s1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v18 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, s0, v19 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v16, s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v11, v5 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v8, v5, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, 
v0, v1, v7 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v3, v5 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v4, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v12, v5, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX7-UNALIGNED: ; %bb.0: @@ -585,52 +588,52 @@ ; ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 10 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NOUNALIGNED-NEXT: 
global_load_ushort v10, v[10:11], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v5 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v10, v5 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NOUNALIGNED-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 10 +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[10:11], off +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v6 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v6, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v5, v6 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v6, v0 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX7-UNALIGNED: ; %bb.0: @@ -648,37 +651,35 @@ ; ; GFX7-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX7-NOUNALIGNED: ; %bb.0: -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:10 -; GFX7-NOUNALIGNED-NEXT: 
buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 -; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 
offset:10 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2 ret <3 x i32> %load } diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs ; TRAP-HANDLER-ENABLE: NumSgprs: 61 -; TRAP-HANDLER-DISABLE: NumSgprs: 79 +; TRAP-HANDLER-DISABLE: NumSgprs: 77 define amdgpu_kernel void @amdhsa_trap_num_sgprs( i32 addrspace(1)* %out0, i32 %in0, i32 addrspace(1)* %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -840,14 +840,14 @@ ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG8]], {{.*$}} ; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 ; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 ; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 @@ -874,14 +874,14 @@ ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG8]], {{.*$}} ; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 ; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 ; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -177,13 +177,13 @@ ; SI-NOSDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] ; SI-NOSDWA: 
v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]] -; SI-SDWA: v_or_b32_sdwa ; SI-SDWA: v_or_b32_e32 +; SI-SDWA: v_or_b32_sdwa ; SI-SDWA: v_or_b32_e32 -; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] ; SI-SDWA: v_or_b32_sdwa +; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; SI-SDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] ; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]] ; SI: v_cmp_eq_u32_e32 vcc, 0 ; SI: v_cmp_ne_u64_e32 vcc, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -636,81 +636,81 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s8, 0xff -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xff00, v4 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 -; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_and_b32_e32 v0, s8, v4 -; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; SI-NEXT: v_and_b32_e32 v2, s8, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_endpgm +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_movk_i32 s0, 0xff +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xff00, v4 +; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, s0, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; SI-NEXT: v_and_b32_e32 v2, s0, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm ; ; VI-LABEL: 
load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v5, 9 -; VI-NEXT: s_movk_i32 s8, 0x900 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 -; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 -; VI-NEXT: v_add_u16_e32 v8, 9, v4 -; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 -; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_add_u16_e32 v0, s8, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: s_endpgm +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: v_mov_b32_e32 v5, 9 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: 
v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_movk_i32 s0, 0x900 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 +; VI-NEXT: v_add_u16_e32 v8, 9, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 +; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u16_e32 v0, s0, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 @@ -725,41 +725,42 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 
0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5 -; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 -; SI-NEXT: v_or_b32_e32 v2, v9, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v7, v8 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:24 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: s_endpgm +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5 +; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 
+; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v9, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 +; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -1210,171 +1210,167 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) { ; SI-LABEL: v_fshr_v2i24: ; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 -; SI-NEXT: s_mov_b32 s4, 0xffffff -; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 5, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 2, v0 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, s4, v1 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v2, s4, v2 -; SI-NEXT: v_mul_hi_u32 v12, v2, s5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v3, s4, v3 -; SI-NEXT: v_mul_hi_u32 v13, v3, s5 -; 
SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v11, s4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 -; SI-NEXT: v_mul_lo_u32 v12, v12, 24 -; SI-NEXT: v_lshrrev_b32_e32 v13, 4, v13 -; SI-NEXT: v_mul_lo_u32 v13, v13, 24 -; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; SI-NEXT: v_lshr_b32_e32 v12, v14, v2 -; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v13 -; SI-NEXT: v_sub_i32_e32 v13, vcc, 24, v2 -; SI-NEXT: v_sub_i32_e32 v14, vcc, 24, v3 -; SI-NEXT: v_and_b32_e32 v13, s4, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshl_b32_e32 v5, v5, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffffff, v14 -; SI-NEXT: v_lshr_b32_e32 v11, v11, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b32_e32 v6, v6, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; SI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen -; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen -; SI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen -; SI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: s_mov_b32 s4, 0xffffff +; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v0 
+; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 5, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 2, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v14, s4, v1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_mul_hi_u32 v12, v2, s5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v3, s4, v3 +; SI-NEXT: v_mul_hi_u32 v13, v3, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, s4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 +; SI-NEXT: v_mul_lo_u32 v12, v12, 24 +; SI-NEXT: v_lshrrev_b32_e32 v13, 4, v13 +; SI-NEXT: v_mul_lo_u32 v13, v13, 24 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; SI-NEXT: v_lshr_b32_e32 v12, v14, v2 +; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v13 +; SI-NEXT: v_sub_i32_e32 v13, vcc, 24, v2 +; SI-NEXT: v_sub_i32_e32 v14, vcc, 24, v3 +; SI-NEXT: v_and_b32_e32 v13, s4, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshl_b32_e32 v6, v6, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffffff, v14 +; SI-NEXT: v_lshr_b32_e32 v11, v11, v3 +; SI-NEXT: v_lshl_b32_e32 v4, v4, v14 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; SI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen +; SI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v2i24: ; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 -; VI-NEXT: s_mov_b32 s4, 0xffffff -; VI-NEXT: s_mov_b32 s5, 0xaaaaaaab -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 5, v0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 2, v0 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_and_b32_e32 v14, s4, v1 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_and_b32_e32 v2, s4, v2 -; VI-NEXT: v_mul_hi_u32 v12, v2, s5 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_and_b32_e32 v3, s4, v3 -; VI-NEXT: v_mul_hi_u32 v13, v3, s5 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_and_b32_e32 v11, s4, v4 -; VI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 -; VI-NEXT: v_mul_lo_u32 v12, v12, 24 -; VI-NEXT: v_lshrrev_b32_e32 v13, 4, v13 -; VI-NEXT: v_mul_lo_u32 v13, v13, 24 -; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v12 -; VI-NEXT: v_lshrrev_b32_e32 v12, v2, v14 -; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v13 -; VI-NEXT: v_sub_u32_e32 v13, vcc, 24, v2 -; VI-NEXT: v_sub_u32_e32 v14, vcc, 24, v3 -; VI-NEXT: v_and_b32_e32 v13, s4, v13 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v5, v13, v5 -; VI-NEXT: v_and_b32_e32 v14, 0xffffff, v14 -; VI-NEXT: v_lshrrev_b32_e32 v11, v3, v11 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v6, v14, v6 -; VI-NEXT: v_or_b32_e32 v5, v5, v12 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-NEXT: v_or_b32_e32 v6, v6, v11 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; VI-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; VI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen -; VI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen -; VI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen -; VI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s4, 0xffffff +; VI-NEXT: s_mov_b32 s5, 0xaaaaaaab +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 5, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 2, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_and_b32_e32 v14, s4, v1 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_and_b32_e32 v2, s4, v2 +; VI-NEXT: v_mul_hi_u32 v12, v2, s5 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_and_b32_e32 v3, s4, v3 +; VI-NEXT: v_mul_hi_u32 v13, v3, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_and_b32_e32 v11, s4, v5 +; VI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 +; VI-NEXT: v_mul_lo_u32 v12, v12, 24 +; VI-NEXT: v_lshrrev_b32_e32 v13, 4, v13 +; VI-NEXT: v_mul_lo_u32 v13, v13, 24 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v12 +; VI-NEXT: v_lshrrev_b32_e32 v12, v2, v14 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v13 +; VI-NEXT: v_sub_u32_e32 v13, vcc, 24, v2 +; VI-NEXT: v_sub_u32_e32 v14, vcc, 24, v3 +; VI-NEXT: v_and_b32_e32 v13, s4, v13 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v6, v13, v6 +; VI-NEXT: v_and_b32_e32 v14, 0xffffff, v14 +; VI-NEXT: v_lshrrev_b32_e32 v11, v3, v11 +; VI-NEXT: v_lshlrev_b32_e32 v4, v14, v4 +; VI-NEXT: v_or_b32_e32 v6, v6, v12 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; VI-NEXT: 
v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen +; VI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen +; VI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen +; VI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_mov_b32 s4, 0xffffff -; GFX9-NEXT: s_mov_b32 s5, 0xaaaaaaab -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_and_b32_e32 v10, s4, v1 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v2, s5 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v3, s5 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v9, s4, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, v2, v10 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v7 -; GFX9-NEXT: v_sub_u32_e32 v7, 24, v2 -; GFX9-NEXT: v_sub_u32_e32 v10, 24, v3 -; GFX9-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, v3, v9 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffffff, v10 -; GFX9-NEXT: s_waitcnt vmcnt(1) 
-; GFX9-NEXT: v_lshl_or_b32 v5, v5, v7, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v6, v8, v10, v9 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5 -; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 -; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2 -; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_mov_b32 s4, 0xffffff +; GFX9-NEXT: s_mov_b32 s5, 0xaaaaaaab +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v2, s5 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v9, s4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v10, s4, v8 +; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, v1, v10 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 +; GFX9-NEXT: v_sub_u32_e32 v7, 24, v1 +; GFX9-NEXT: v_sub_u32_e32 v10, 24, v2 +; GFX9-NEXT: v_and_b32_e32 v7, s4, 
v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, v2, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v5, v5, v7, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc +; GFX9-NEXT: v_lshl_or_b32 v3, v3, v10, v9 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5 +; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 +; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2 +; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fshr_v2i24: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -273,8 +273,8 @@ ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} ; MOVREL: v_movreld_b32_e32 v0, 5 -; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST) +; IDXMODE: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00{{$}} +; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST) ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) { diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -855,10 +855,10 @@ ; multiple. 
; FUNC-LABEL: {{^}}packed_struct_argument_alignment: ; HSA-GFX9: kernarg_segment_byte_size = 28 -; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 -; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 ; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 +; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 +; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -237,157 +237,157 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s18, 0xfc01 -; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 -; SI-NEXT: s_add_i32 s19, s0, s18 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 -; SI-NEXT: s_brev_b32 s20, 1 -; SI-NEXT: s_andn2_b64 s[16:17], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, s20 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_mov_b32_e32 v0, s17 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s19, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s19, 51 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v2, s10 -; 
SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_add_i32 s17, s0, s18 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: s_brev_b32 s16, -2 -; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v4, s11 -; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 -; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: s_andn2_b64 s[10:11], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, s20 -; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s17, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s17, 51 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] -; SI-NEXT: s_bfe_u32 s0, s15, 0xb0014 -; SI-NEXT: s_add_i32 s10, s0, s18 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[0:1] -; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 -; SI-NEXT: s_and_b32 s0, s15, s20 -; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc -; SI-NEXT: v_mov_b32_e32 v5, s0 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s10, 51 -; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] -; SI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] -; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 -; 
SI-NEXT: v_mov_b32_e32 v10, s15 -; SI-NEXT: s_add_i32 s8, s0, s18 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s8 -; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 -; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc -; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: s_andn2_b64 s[2:3], s[12:13], s[0:1] -; SI-NEXT: s_and_b32 s0, s13, s20 -; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; SI-NEXT: v_mov_b32_e32 v5, s0 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s8, 0 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s8, 51 -; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v10, s12 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v13, s13 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v12, s16, v12, v13 -; SI-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc -; SI-NEXT: v_mov_b32_e32 v10, 0 -; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[8:9] -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: s_endpgm +; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_movk_i32 s18, 0xfc01 +; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 +; SI-NEXT: s_add_i32 s19, s0, s18 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 +; SI-NEXT: s_brev_b32 s20, 1 +; SI-NEXT: s_andn2_b64 s[16:17], s[10:11], s[0:1] +; SI-NEXT: s_and_b32 s0, s11, s20 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s17 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s19, 0 
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s19, 51 +; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] +; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 +; SI-NEXT: s_add_i32 s17, s0, s18 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 +; SI-NEXT: s_brev_b32 s16, -2 +; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 +; SI-NEXT: v_mov_b32_e32 v4, s11 +; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_andn2_b64 s[10:11], s[8:9], s[0:1] +; SI-NEXT: s_and_b32 s0, s9, s20 +; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s17, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s17, 51 +; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] +; SI-NEXT: s_bfe_u32 s0, s15, 0xb0014 +; SI-NEXT: s_add_i32 s10, s0, s18 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[0:1] +; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 +; SI-NEXT: s_and_b32 s0, s15, s20 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc +; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s10, 51 +; SI-NEXT: 
v_cndmask_b32_e64 v5, v4, v5, s[0:1] +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] +; SI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] +; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v10, s15 +; SI-NEXT: s_add_i32 s8, s0, s18 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s8 +; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 +; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc +; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: s_andn2_b64 s[2:3], s[12:13], s[0:1] +; SI-NEXT: s_and_b32 s0, s13, s20 +; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] +; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s8, 0 +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s8, 51 +; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] +; SI-NEXT: v_mov_b32_e32 v4, s2 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] +; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[4:5] +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5 +; SI-NEXT: v_bfi_b32 v12, s16, v12, v13 +; SI-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc +; SI-NEXT: v_mov_b32_e32 v10, 0 +; SI-NEXT: v_mov_b32_e32 v8, 0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11] +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[8:9] +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 -; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; 
CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] -; CI-NEXT: v_mov_b32_e32 v4, s11 -; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s2, v12, v4 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[8:9] -; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; CI-NEXT: v_mov_b32_e32 v2, 0 -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], s[8:9], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v4, s9 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v4, s2, v12, v4 -; CI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[14:15] -; CI-NEXT: v_mov_b32_e32 v10, s15 -; CI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] -; CI-NEXT: v_bfi_b32 v10, s2, v12, v10 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v6, 0 -; CI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[12:13] -; CI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; CI-NEXT: v_add_f64 v[4:5], s[12:13], -v[10:11] -; CI-NEXT: v_mov_b32_e32 v13, s13 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v12, s2, v12, v13 -; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc -; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] -; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1] -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_endpgm +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; CI-NEXT: s_brev_b32 s12, -2 +; CI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v4, s7 +; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5] +; CI-NEXT: 
v_cndmask_b32_e32 v3, 0, v4, vcc +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v4, s5 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 +; CI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[10:11] +; CI-NEXT: v_mov_b32_e32 v10, s11 +; CI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] +; CI-NEXT: v_bfi_b32 v10, s12, v12, v10 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v6, 0 +; CI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] +; CI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] +; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[10:11] +; CI-NEXT: v_mov_b32_e32 v13, s9 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; CI-NEXT: v_bfi_b32 v12, s12, v12, v13 +; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc +; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1] +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_endpgm %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, <4 x double> addrspace(1)* %out ret void @@ -600,82 +600,82 @@ ; ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 -; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] -; CI-NEXT: v_mov_b32_e32 v4, s11 -; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s2, v16, v4 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v2, 0 -; CI-NEXT: v_cndmask_b32_e32 v3, 0, 
v4, vcc -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], s[8:9], -v[4:5] -; CI-NEXT: v_mov_b32_e32 v6, s9 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v6, s2, v16, v6 -; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15] -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] -; CI-NEXT: v_add_f64 v[4:5], s[14:15], -v[6:7] -; CI-NEXT: v_mov_b32_e32 v8, s15 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v8, s2, v16, v8 -; CI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[12:13] -; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_add_f64 v[4:5], s[12:13], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v10, s13 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v10, s2, v16, v10 -; CI-NEXT: v_cndmask_b32_e32 v5, 0, v10, vcc -; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: v_add_f64 v[4:5], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v8, s19 -; CI-NEXT: v_bfi_b32 v18, s2, v16, v8 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[20:21] -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[22:23] -; CI-NEXT: v_add_f64 v[14:15], s[20:21], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v19, s23 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; CI-NEXT: v_add_f64 v[14:15], s[22:23], -v[10:11] -; CI-NEXT: v_mov_b32_e32 v17, s21 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 -; CI-NEXT: v_bfi_b32 v19, s2, v16, v19 -; CI-NEXT: v_trunc_f64_e32 v[12:13], s[16:17] -; CI-NEXT: v_bfi_b32 v17, s2, v16, v17 -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] -; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[14:15] -; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc -; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_mov_b32_e32 v17, s17 -; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15] -; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[12:13] -; CI-NEXT: v_bfi_b32 v19, 
s2, v16, v17 -; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[16:17] -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1] -; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[14:15] -; CI-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc -; CI-NEXT: v_mov_b32_e32 v16, 0 -; CI-NEXT: v_add_f64 v[12:13], v[12:13], v[16:17] -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_endpgm +; CI-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] +; CI-NEXT: v_mov_b32_e32 v4, s11 +; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s2, v16, v4 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; CI-NEXT: v_add_f64 v[0:1], s[8:9], -v[4:5] +; CI-NEXT: v_mov_b32_e32 v6, s9 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v6, s2, v16, v6 +; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15] +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] +; CI-NEXT: v_add_f64 v[4:5], s[14:15], -v[6:7] +; CI-NEXT: v_mov_b32_e32 v8, s15 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; CI-NEXT: v_bfi_b32 v8, s2, v16, v8 +; CI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[12:13] +; CI-NEXT: v_mov_b32_e32 v4, 0 +; 
CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] +; CI-NEXT: v_add_f64 v[4:5], s[12:13], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v10, s13 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; CI-NEXT: v_bfi_b32 v10, s2, v16, v10 +; CI-NEXT: v_cndmask_b32_e32 v5, 0, v10, vcc +; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_add_f64 v[4:5], v[8:9], v[4:5] +; CI-NEXT: v_mov_b32_e32 v8, s19 +; CI-NEXT: v_bfi_b32 v18, s2, v16, v8 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[20:21] +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[22:23] +; CI-NEXT: v_add_f64 v[14:15], s[20:21], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v19, s23 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 +; CI-NEXT: v_add_f64 v[14:15], s[22:23], -v[10:11] +; CI-NEXT: v_mov_b32_e32 v17, s21 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 +; CI-NEXT: v_bfi_b32 v19, s2, v16, v19 +; CI-NEXT: v_trunc_f64_e32 v[12:13], s[16:17] +; CI-NEXT: v_bfi_b32 v17, s2, v16, v17 +; CI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] +; CI-NEXT: v_mov_b32_e32 v14, 0 +; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[14:15] +; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc +; CI-NEXT: v_mov_b32_e32 v14, 0 +; CI-NEXT: v_mov_b32_e32 v17, s17 +; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15] +; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[12:13] +; CI-NEXT: v_bfi_b32 v19, s2, v16, v17 +; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 +; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[16:17] +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v14, 0 +; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1] +; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[14:15] +; CI-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc +; CI-NEXT: v_mov_b32_e32 v16, 0 +; CI-NEXT: v_add_f64 v[12:13], v[12:13], v[16:17] +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; CI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, <8 x double> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -566,7 +566,6 @@ ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32: ; GCN: buffer_store_dwordx4 ; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dword v ; CI: buffer_store_dwordx3 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -17,9 +17,9 @@ ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 @@ -86,14 +86,14 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off 
offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -300,9 +300,9 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -456,10 +456,10 @@ ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], 
v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -6,14 +6,14 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { ; CIVI-LABEL: local_store_i56: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 -; CIVI-NEXT: ds_write_b32 v0, v1 -; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 +; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 +; CIVI-NEXT: ds_write_b32 v0, v1 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i56: ; GFX9: ; %bb.0: @@ -30,70 +30,70 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s4, 14 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s5 -; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: 
s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, s0 -; HAWAII-NEXT: v_mov_b32_e32 v3, s1 -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 -; HAWAII-NEXT: ds_write_b32 v1, v3 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_or_b32 s0, s4, 14 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 +; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v1, s0 +; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: v_mov_b32_e32 v3, s2 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 +; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 +; HAWAII-NEXT: ds_write_b32 v1, v2 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s4, 14 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s5 -; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v1, s0 -; FIJI-NEXT: v_mov_b32_e32 v3, s1 -; FIJI-NEXT: s_and_b32 s3, s2, 0xffff -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 -; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 -; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 -; FIJI-NEXT: ds_write_b32 v1, v3 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_or_b32 s0, s4, 14 +; 
FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, s0 +; FIJI-NEXT: v_mov_b32_e32 v3, s1 +; FIJI-NEXT: s_and_b32 s3, s2, 0xffff +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 +; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 +; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 +; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 +; FIJI-NEXT: ds_write_b32 v1, v3 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_and_b32 s3, s2, 0xffff -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v1, s3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: ds_write_b32 v0, v3 -; GFX9-NEXT: s_endpgm +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_and_b32 s3, s2, 0xffff +; GFX9-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v2, s3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void } @@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s2 -; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b32 v0, v1 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 +; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 +; HAWAII-NEXT: ds_write_b32 v0, v2 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: ds_write_b32 v0, v1 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; 
FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v2, s1 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 +; FIJI-NEXT: ds_write_b32 v0, v2 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: @@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: s_and_b32 s3, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v0, s3 -; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b64 v2, v[0:1] -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v3, s3 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 +; HAWAII-NEXT: ds_write_b64 v2, v[0:1] +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: s_and_b32 s3, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v0, s3 -; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: ds_write_b64 v2, v[0:1] -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; 
FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v3, s3 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 +; FIJI-NEXT: ds_write_b64 v2, v[0:1] +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: @@ -218,22 +218,22 @@ define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 -; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; CIVI-NEXT: ds_write_b16 v0, v1 +; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i17: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll --- a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll +++ 
b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll @@ -9,14 +9,14 @@ ; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 ; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 ; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN-TFILD: buffer_store_dword [[REG8]], {{.*$}} ; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:4 ; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:8 ; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} offset:12 ; GCN-TFILD: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:20 ; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:24 ; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:28 @@ -24,14 +24,14 @@ ; GCN-TFIL7: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 ; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28 -; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24 -; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20 ; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 ; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 ; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 ; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 ; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 +; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20 ; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:16 ; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:12 ; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll 
b/llvm/test/CodeGen/AMDGPU/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -39,6 +39,7 @@ ; GFX6-NEXT: s_load_dword s3, s[0:1], 0x26 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x1d ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -50,14 +51,12 @@ ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x1d ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1