diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -97,6 +97,14 @@ SchedModel.init(&ST); } +// Option to enable a heuristic for computing max mem ops cluster size which is +// based on clustered bytes. +static cl::opt EnableNewMemOpsClusterHeuristic( + "amdgpu-enable-mem-ops-cluster-heuristic-based-on-clustered-bytes", + cl::desc("Enable clustered-bytes based heuristic for computing max mem ops " + "cluster size"), + cl::init(false), cl::Hidden); + //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks //===----------------------------------------------------------------------===// @@ -482,19 +490,77 @@ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) return false; - // Compute max cluster size based on average number bytes clustered till now, - // and decide based on it, if current mem ops pair can be clustered or not. - assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) && - "Invalid NumLoads/NumBytes values"); - unsigned MaxNumLoads; - if (NumBytes <= 4 * NumLoads) { - // Loads are dword or smaller (on average). - MaxNumLoads = 5; + if (!EnableNewMemOpsClusterHeuristic) { + // New heuristic to compute max cluster size is `not` enabled, continue with + // the existing logic. + const MachineOperand *FirstDst = nullptr; + const MachineOperand *SecondDst = nullptr; + + if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || + (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || + (isMIMG(FirstLdSt) && isMIMG(SecondLdSt)) || + (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { + const unsigned MaxGlobalLoadCluster = 7; + if (NumLoads > MaxGlobalLoadCluster) + return false; + + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); + if (!FirstDst) + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); + if (!SecondDst) + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); + } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); + } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); + } + + if (!FirstDst || !SecondDst) + return false; + + // Try to limit clustering based on the total number of bytes loaded + // rather than the number of instructions. This is done to help reduce + // register pressure. The method used is somewhat inexact, though, + // because it assumes that all loads in the cluster will load the + // same number of bytes as FirstLdSt. + + // The unit of this value is bytes. + // FIXME: This needs finer tuning. + unsigned LoadClusterThreshold = 16; + + const MachineRegisterInfo &MRI = + FirstLdSt.getParent()->getParent()->getRegInfo(); + + const Register Reg = FirstDst->getReg(); + + const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg) + ? MRI.getRegClass(Reg) + : RI.getPhysRegClass(Reg); + + // FIXME: NumLoads should not be subtracted 1. This is to match behavior + // of clusterNeighboringMemOps which was previosly passing cluster length + // less 1. LoadClusterThreshold should be tuned instead. + return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <= + LoadClusterThreshold; } else { - // Loads are bigger than a dword (on average). - MaxNumLoads = 4; + // Use new heuristic to compute max cluster size based on average number + // bytes clustered till now, and decide based on it, if current mem ops pair + // can be clustered or not. + assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) && + "Invalid NumLoads/NumBytes values"); + unsigned MaxNumLoads; + if (NumBytes <= 4 * NumLoads) { + // Loads are dword or smaller (on average). + MaxNumLoads = 5; + } else { + // Loads are bigger than a dword (on average). + MaxNumLoads = 4; + } + return NumLoads <= MaxNumLoads; } - return NumLoads <= MaxNumLoads; } // FIXME: This behaves strangely. If, for example, you have 32 load + stores, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -235,23 +235,23 @@ ; ; GFX8-LABEL: test_div_fmas_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x94 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s2, 1, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: s_nop 3 -; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: s_endpgm +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0xb8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_and_b32 s2, 1, s5 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: s_nop 3 +; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm ; ; GFX10_W32-LABEL: test_div_fmas_f32: ; GFX10_W32: ; %bb.0: @@ -527,43 +527,43 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) { ; GFX7-LABEL: test_div_fmas_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x11 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-NEXT: s_and_b32 s0, 1, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s9 -; GFX7-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX7-NEXT: s_nop 3 -; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_endpgm +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x11 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: s_and_b32 s2, 1, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX7-NEXT: s_nop 3 +; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x44 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: s_and_b32 s0, 1, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_nop 3 -; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_endpgm +; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_and_b32 s2, 1, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: s_nop 3 +; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm ; ; GFX10_W32-LABEL: test_div_fmas_f64: ; GFX10_W32: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs ; TRAP-HANDLER-ENABLE: NumSgprs: 61 -; TRAP-HANDLER-DISABLE: NumSgprs: 77 +; TRAP-HANDLER-DISABLE: NumSgprs: 79 define amdgpu_kernel void @amdhsa_trap_num_sgprs( i32 addrspace(1)* %out0, i32 %in0, i32 addrspace(1)* %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -855,10 +855,10 @@ ; multiple. ; FUNC-LABEL: {{^}}packed_struct_argument_alignment: ; HSA-GFX9: kernarg_segment_byte_size = 28 -; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 ; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 ; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -51,38 +51,38 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30 -; GCN-NEXT: v_mov_b32_e32 v16, s18 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_mov_b32_e32 v8, s8 -; GCN-NEXT: v_mov_b32_e32 v12, s12 -; GCN-NEXT: v_mov_b32_e32 v17, s19 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: v_mov_b32_e32 v9, s9 -; GCN-NEXT: v_mov_b32_e32 v10, s10 -; GCN-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NEXT: v_mov_b32_e32 v13, s13 -; GCN-NEXT: v_mov_b32_e32 v14, s14 -; GCN-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off -; GCN-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16 -; GCN-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32 -; GCN-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48 -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30 +; GCN-NEXT: v_mov_b32_e32 v12, s18 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s8 +; GCN-NEXT: v_mov_b32_e32 v13, s19 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off +; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 +; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v9, s9 +; GCN-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48 +; GCN-NEXT: s_endpgm bb: %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16 %tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -17,8 +17,8 @@ ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 @@ -86,7 +86,6 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -96,6 +95,7 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -299,9 +299,9 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -455,11 +455,11 @@ ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -203,10 +203,8 @@ ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: ; SI: s_mov_b32 {{s[0-9]+}}, 0x13480 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 +; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -11,32 +11,32 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_cbranch_scc0 BB0_2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cbranch_scc0 BB0_2 ; SI-NEXT:; %bb.1: ; %else -; SI-NEXT: s_add_i32 s2, s7, s2 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; SI-NEXT: s_cbranch_vccz BB0_3 -; SI-NEXT: s_branch BB0_4 +; SI-NEXT: s_add_i32 s0, s11, s0 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz BB0_3 +; SI-NEXT: s_branch BB0_4 ; SI-NEXT:BB0_2: -; SI-NEXT: s_mov_b64 s[8:9], -1 -; SI-NEXT: ; implicit-def: $sgpr2 -; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; SI-NEXT: s_cbranch_vccnz BB0_4 +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: ; implicit-def: $sgpr0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccnz BB0_4 ; SI-NEXT:BB0_3: ; %if -; SI-NEXT: s_sub_i32 s2, s5, s6 +; SI-NEXT: s_sub_i32 s0, s9, s10 ; SI-NEXT:BB0_4: ; %endif -; SI-NEXT: s_add_i32 s4, s2, s4 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_endpgm +; SI-NEXT: s_add_i32 s0, s0, s8 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -446,12 +446,11 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 @@ -504,6 +503,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -516,12 +516,11 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 @@ -574,6 +573,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -586,8 +586,8 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -6,14 +6,14 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { ; CIVI-LABEL: local_store_i56: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 -; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 -; CIVI-NEXT: ds_write_b32 v0, v1 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 +; CIVI-NEXT: ds_write_b32 v0, v1 +; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i56: ; GFX9: ; %bb.0: @@ -30,70 +30,70 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s4, 14 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s5 -; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, s0 -; HAWAII-NEXT: v_mov_b32_e32 v2, s1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s2 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 -; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 -; HAWAII-NEXT: ds_write_b32 v1, v2 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_or_b32 s0, s4, 14 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 +; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v1, s0 +; HAWAII-NEXT: v_mov_b32_e32 v3, s1 +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 +; HAWAII-NEXT: ds_write_b32 v1, v3 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s4, 14 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s5 -; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v1, s0 -; FIJI-NEXT: v_mov_b32_e32 v3, s1 -; FIJI-NEXT: s_and_b32 s3, s2, 0xffff -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 -; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 -; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 -; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 -; FIJI-NEXT: ds_write_b32 v1, v3 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_or_b32 s0, s4, 14 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, s0 +; FIJI-NEXT: v_mov_b32_e32 v3, s1 +; FIJI-NEXT: s_and_b32 s3, s2, 0xffff +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 +; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 +; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 +; FIJI-NEXT: ds_write_b32 v1, v3 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_and_b32 s3, s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, s3, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b32 v0, v3 -; GFX9-NEXT: s_endpgm +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_and_b32 s3, s2, 0xffff +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s3, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void } @@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v2, s1 -; HAWAII-NEXT: v_mov_b32_e32 v1, s2 -; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 -; HAWAII-NEXT: ds_write_b32 v0, v2 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 +; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b32 v0, v1 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v2, s1 -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 -; FIJI-NEXT: ds_write_b32 v0, v2 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b32 v0, v1 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: @@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: s_and_b32 s3, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s3 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 -; HAWAII-NEXT: ds_write_b64 v2, v[0:1] -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v0, s3 +; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b64 v2, v[0:1] +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: s_and_b32 s3, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v3, s3 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 -; FIJI-NEXT: ds_write_b64 v2, v[0:1] -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v0, s3 +; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b64 v2, v[0:1] +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: @@ -218,22 +218,22 @@ define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b16 v0, v1 +; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i17: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -36,40 +36,41 @@ ; ; GFX6-LABEL: test_udivrem: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x26 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x1d -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s2, 0, s3 -; GFX6-NEXT: s_mov_b32 s11, s7 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, v2, s[0:1] -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_endpgm +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x26 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s2, 0, s3 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x1d +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, v2, s[0:1] +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem: ; GFX8: ; %bb.0: